BabaK07 commited on
Commit
aefb7b1
·
1 Parent(s): 62d2116

feat(document-matching): simplify to LLM-based document filtering

Browse files

- Replace two-stage keyword + LLM reranking with pure LLM semantic filtering
- Update document matching to send all user documents to LLM for analysis
- LLM now selects 0 to N relevant documents based on actual relevance instead of a forced limit
- Rename _llm_verify_document_hashes to _llm_filter_documents for clarity
- Add X-Session-Id header support for improved session tracking in the ask_question endpoint
- Update README to reflect simplified document filtering approach
- Add session ID logging for debugging agent thread IDs
- Improve LLM prompt to emphasize quality over quantity in document selection

README.md CHANGED
@@ -27,8 +27,8 @@ Uploaded PDFs are parsed page by page and split into chunks.
27
  Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
28
 
29
  At question time:
30
- 1. Document matching uses keyword scoring + LLM semantic reranking
31
- 2. Relevant chunks are retrieved from matched documents via vector search
32
  3. The agent answers from those chunks when possible
33
  4. If evidence is weak, the agent uses web search and cites external URLs
34
 
@@ -84,12 +84,12 @@ Why I chose this:
84
  - avoids duplicate indexing,
85
  - keeps retrieval secure per user.
86
 
87
- I also implemented a two-stage document matching system:
88
 
89
- - Stage 1: Fast keyword scoring checks exact phrase matches and word-level matches across filename, summary, and preview text with weighted scoring (filename matches score higher than preview matches).
90
- - Stage 2: LLM semantic reranking takes the top scored candidates (up to 8) and reranks them based on semantic similarity to the query.
91
-
92
- This hybrid approach balances speed and accuracy - keyword filtering is fast and catches obvious matches, while the LLM handles nuanced semantic understanding without processing every document.
93
 
94
  ## Challenges I Ran Into
95
 
 
27
  Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
28
 
29
  At question time:
30
+ 1. LLM-based document filtering selects relevant documents from the user's library
31
+ 2. Vector search retrieves relevant chunks from selected documents
32
  3. The agent answers from those chunks when possible
33
  4. If evidence is weak, the agent uses web search and cites external URLs
34
 
 
84
  - avoids duplicate indexing,
85
  - keeps retrieval secure per user.
86
 
87
+ I also implemented LLM-based document filtering:
88
 
89
+ - The system sends all user documents (filename, summary, preview) to the LLM
90
+ - LLM semantically analyzes and selects only truly relevant documents for the query
91
+ - Returns 0 to N documents based on actual relevance (it is not forced to always return the maximum limit)
92
+ - Fallback returns first N documents if LLM call fails
93
 
94
  ## Challenges I Ran Into
95
 
app/main.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  from typing import Any
3
 
4
- from fastapi import Cookie, Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, status
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from fastapi.responses import StreamingResponse
7
  from langchain_core.messages import HumanMessage, ToolMessage
@@ -345,11 +345,19 @@ def ask_question(
345
  db: Session = Depends(get_db),
346
  user: User = Depends(get_current_user),
347
  access_token: str | None = Cookie(default=None),
 
348
  ):
349
  document_service.ensure_page_metadata_for_user(db=db, user=user)
350
  agent = build_agent(db=db, user=user)
351
- session_key = access_token or f"user:{user.id}"
 
 
 
 
 
 
352
  config = {"configurable": {"thread_id": session_key}}
 
353
  previous_messages: list[Any] = []
354
  try:
355
  state = agent.get_state(config)
 
1
  import re
2
  from typing import Any
3
 
4
+ from fastapi import Cookie, Depends, FastAPI, File, Form, Header, HTTPException, Request, UploadFile, status
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from fastapi.responses import StreamingResponse
7
  from langchain_core.messages import HumanMessage, ToolMessage
 
345
  db: Session = Depends(get_db),
346
  user: User = Depends(get_current_user),
347
  access_token: str | None = Cookie(default=None),
348
+ x_session_id: str | None = Header(default=None, alias="X-Session-Id"),
349
  ):
350
  document_service.ensure_page_metadata_for_user(db=db, user=user)
351
  agent = build_agent(db=db, user=user)
352
+
353
+ # Use session ID from header if provided, otherwise fall back to access token or user ID
354
+ if x_session_id:
355
+ session_key = f"user:{user.id}:session:{x_session_id}"
356
+ else:
357
+ session_key = access_token or f"user:{user.id}"
358
+
359
  config = {"configurable": {"thread_id": session_key}}
360
+ print(f"[Agent] thread_id: {session_key}")
361
  previous_messages: list[Any] = []
362
  try:
363
  state = agent.get_state(config)
app/services/document_service.py CHANGED
@@ -136,69 +136,19 @@ class DocumentService:
136
  if not docs:
137
  return []
138
 
139
- query_lower = query.lower()
140
- scored: list[tuple[float, str, Document]] = []
141
-
142
- for doc in docs:
143
- score = 0.0
144
-
145
- # Exact phrase matching (highest priority)
146
- if query_lower in (doc.filename or "").lower():
147
- score += 10.0
148
- if query_lower in (doc.summary or "").lower():
149
- score += 5.0
150
- if query_lower in (doc.extracted_preview or "").lower():
151
- score += 2.0
152
-
153
- # Word-level matching
154
- query_words = query_lower.split()
155
- filename_lower = (doc.filename or "").lower()
156
- summary_lower = (doc.summary or "").lower()
157
- preview_lower = (doc.extracted_preview or "").lower()
158
-
159
- for word in query_words:
160
- if len(word) > 2: # Skip very short words
161
- if word in filename_lower:
162
- score += 3.0
163
- if word in summary_lower:
164
- score += 1.5
165
- if word in preview_lower:
166
- score += 0.5
167
-
168
- if score > 0:
169
- scored.append((score, doc.file_hash, doc))
170
 
171
- # Sort by score
172
- scored.sort(reverse=True, key=lambda x: x[0])
173
-
174
- # Take top candidates for LLM (up to 8)
175
- candidates_count = min(max(limit * 2, 8), len(scored)) if scored else min(limit, len(docs))
176
-
177
- if scored:
178
- ranked_docs = [doc for _, _, doc in scored[:candidates_count]]
179
- ranked_hashes = [file_hash for _, file_hash, _ in scored[:candidates_count]]
180
- else:
181
- # No keyword matches, use all docs up to limit
182
- ranked_docs = docs[:candidates_count]
183
- ranked_hashes = [doc.file_hash for doc in ranked_docs]
184
-
185
- # LLM reranking
186
- llm_ranked_hashes = self._llm_verify_document_hashes(query=query, candidates=ranked_docs, limit=limit)
187
-
188
- # Merge: LLM results first, then keyword fallback
189
- merged = llm_ranked_hashes + [h for h in ranked_hashes if h not in llm_ranked_hashes]
190
- return merged[:limit]
191
-
192
-
193
-
194
- def _llm_verify_document_hashes(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
195
  if not self.settings.groq_api_key or not candidates:
196
  return []
197
  if self.matcher_llm is None:
198
  self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
199
 
200
  payload = []
201
- for doc in candidates[:8]:
202
  payload.append(
203
  {
204
  "file_hash": doc.file_hash,
@@ -209,12 +159,15 @@ class DocumentService:
209
  )
210
 
211
  prompt = (
212
- "Rank the most relevant documents for the user query based on semantic similarity.\n"
 
 
 
 
213
  "Return ONLY valid JSON with this exact schema:\n"
214
- '{"file_hashes": ["<hash1>", "<hash2>"]}\n'
215
- f"Return at most {limit} hashes ordered by relevance.\n\n"
216
- f"User query:\n{query}\n\n"
217
- f"Candidates:\n{json.dumps(payload, ensure_ascii=True)}"
218
  )
219
  try:
220
  response = self.matcher_llm.invoke(prompt)
@@ -231,7 +184,8 @@ class DocumentService:
231
  valid = {item.get("file_hash", "") for item in payload}
232
  return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
233
  except Exception:
234
- return []
 
235
 
236
  def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
237
  docs = self.list_user_documents(db, user)
 
136
  if not docs:
137
  return []
138
 
139
+ # Send all documents to LLM for semantic matching
140
+ matched_hashes = self._llm_filter_documents(query=query, candidates=docs, limit=limit)
141
+ print("Documents Matched ----->", matched_hashes)
142
+ return matched_hashes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ def _llm_filter_documents(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if not self.settings.groq_api_key or not candidates:
146
  return []
147
  if self.matcher_llm is None:
148
  self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
149
 
150
  payload = []
151
+ for doc in candidates:
152
  payload.append(
153
  {
154
  "file_hash": doc.file_hash,
 
159
  )
160
 
161
  prompt = (
162
+ "You are a document relevance filter. Analyze the user query and select ONLY the truly relevant documents.\n"
163
+ "Consider semantic similarity, topic alignment, and document purpose.\n\n"
164
+ "IMPORTANT: Only include documents that are actually relevant to answering the query.\n"
165
+ "It's better to return fewer relevant documents than to include irrelevant ones.\n"
166
+ f"You may return anywhere from 0 to {limit} documents.\n\n"
167
  "Return ONLY valid JSON with this exact schema:\n"
168
+ '{"file_hashes": ["<hash1>", "<hash2>", ...]}\n\n'
169
+ f"User query: {query}\n\n"
170
+ f"Available documents:\n{json.dumps(payload, ensure_ascii=True, indent=2)}"
 
171
  )
172
  try:
173
  response = self.matcher_llm.invoke(prompt)
 
184
  valid = {item.get("file_hash", "") for item in payload}
185
  return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
186
  except Exception:
187
+ # Fallback: return first N documents
188
+ return [doc.file_hash for doc in candidates[:limit]]
189
 
190
  def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
191
  docs = self.list_user_documents(db, user)
app/templates/index.html CHANGED
@@ -118,7 +118,10 @@
118
  <section class="card panel chat-shell chat-panel">
119
  <div class="panel-head panel-head-inline">
120
  <h2>DocsQA Chat</h2>
121
- <span class="badge">Markdown enabled</span>
 
 
 
122
  </div>
123
  <div id="chat-thread" class="chat-thread">
124
  <article class="chat-msg assistant">
@@ -142,6 +145,10 @@
142
  </main>
143
 
144
  <script>
 
 
 
 
145
  const registerForm = document.getElementById("register-form");
146
  const loginForm = document.getElementById("login-form");
147
  const logoutForm = document.getElementById("logout-form");
@@ -153,6 +160,22 @@
153
  const chatThread = document.getElementById("chat-thread");
154
  const queryInput = document.getElementById("query");
155
  const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  const safeJson = async (response) => {
158
  try {
@@ -542,7 +565,10 @@
542
 
543
  const response = await fetch("/ask", {
544
  method: "POST",
545
- headers: { "Content-Type": "application/json" },
 
 
 
546
  body: JSON.stringify({ query }),
547
  });
548
  const body = await safeJson(response);
 
118
  <section class="card panel chat-shell chat-panel">
119
  <div class="panel-head panel-head-inline">
120
  <h2>DocsQA Chat</h2>
121
+ <div style="display: flex; gap: 8px; align-items: center;">
122
+ <button type="button" id="new-chat-btn" class="secondary" style="font-size: 0.875rem; padding: 6px 12px;">New Chat</button>
123
+ <span class="badge">Markdown enabled</span>
124
+ </div>
125
  </div>
126
  <div id="chat-thread" class="chat-thread">
127
  <article class="chat-msg assistant">
 
145
  </main>
146
 
147
  <script>
148
+ // Session management
149
+ let currentSessionId = sessionStorage.getItem("chat_session_id") || `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
150
+ sessionStorage.setItem("chat_session_id", currentSessionId);
151
+
152
  const registerForm = document.getElementById("register-form");
153
  const loginForm = document.getElementById("login-form");
154
  const logoutForm = document.getElementById("logout-form");
 
160
  const chatThread = document.getElementById("chat-thread");
161
  const queryInput = document.getElementById("query");
162
  const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
163
+ const newChatBtn = document.getElementById("new-chat-btn");
164
+
165
+ // New Chat button handler
166
+ newChatBtn?.addEventListener("click", () => {
167
+ currentSessionId = `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
168
+ sessionStorage.setItem("chat_session_id", currentSessionId);
169
+ if (chatThread) {
170
+ chatThread.innerHTML = `
171
+ <article class="chat-msg assistant">
172
+ <div class="chat-bubble chat-bubble-assistant chat-markdown">
173
+ <p>Ask anything about your uploaded PDFs and I will answer with citations from retrieved chunks.</p>
174
+ </div>
175
+ </article>
176
+ `;
177
+ }
178
+ });
179
 
180
  const safeJson = async (response) => {
181
  try {
 
565
 
566
  const response = await fetch("/ask", {
567
  method: "POST",
568
+ headers: {
569
+ "Content-Type": "application/json",
570
+ "X-Session-Id": currentSessionId
571
+ },
572
  body: JSON.stringify({ query }),
573
  });
574
  const body = await safeJson(response);