BabaK07 commited on
Commit
aefb7b1
·
1 Parent(s): 62d2116

feat(document-matching): simplify to LLM-based document filtering

Browse files

- Replace two-stage keyword + LLM reranking with pure LLM semantic filtering
- Update document matching to send all user documents to LLM for analysis
- LLM now selects 0 to N relevant documents based on actual relevance instead of a forced limit
- Rename _llm_verify_document_hashes to _llm_filter_documents for clarity
- Add X-Session-Id header support for improved session tracking in the ask_question endpoint
- Update README to reflect simplified document filtering approach
- Add session ID logging for debugging agent thread IDs
- Improve LLM prompt to emphasize quality over quantity in document selection

README.md CHANGED
@@ -27,8 +27,8 @@ Uploaded PDFs are parsed page by page and split into chunks.
27
  Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
28
 
29
  At question time:
30
- 1. Document matching uses keyword scoring + LLM semantic reranking
31
- 2. Relevant chunks are retrieved from matched documents via vector search
32
  3. The agent answers from those chunks when possible
33
  4. If evidence is weak, the agent uses web search and cites external URLs
34
 
@@ -84,12 +84,12 @@ Why I chose this:
84
  - avoids duplicate indexing,
85
  - keeps retrieval secure per user.
86
 
87
- I also implemented a two-stage document matching system:
88
 
89
- - Stage 1: Fast keyword scoring checks exact phrase matches and word-level matches across filename, summary, and preview text with weighted scoring (filename matches score higher than preview matches).
90
- - Stage 2: LLM semantic reranking takes the top scored candidates (up to 8) and reranks them based on semantic similarity to the query.
91
-
92
- This hybrid approach balances speed and accuracy - keyword filtering is fast and catches obvious matches, while the LLM handles nuanced semantic understanding without processing every document.
93
 
94
  ## Challenges I Ran Into
95
 
 
27
  Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
28
 
29
  At question time:
30
+ 1. LLM-based document filtering selects relevant documents from the user's library
31
+ 2. Vector search retrieves relevant chunks from selected documents
32
  3. The agent answers from those chunks when possible
33
  4. If evidence is weak, the agent uses web search and cites external URLs
34
 
 
84
  - avoids duplicate indexing,
85
  - keeps retrieval secure per user.
86
 
87
+ I also implemented LLM-based document filtering:
88
 
89
+ - The system sends all user documents (filename, summary, preview) to the LLM
90
+ - LLM semantically analyzes and selects only truly relevant documents for the query
91
+ - Returns 0 to N documents based on actual relevance (it is not forced to always return the maximum limit)
92
+ - Fallback returns first N documents if LLM call fails
93
 
94
  ## Challenges I Ran Into
95
 
app/main.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  from typing import Any
3
 
4
- from fastapi import Cookie, Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, status
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from fastapi.responses import StreamingResponse
7
  from langchain_core.messages import HumanMessage, ToolMessage
@@ -345,11 +345,19 @@ def ask_question(
345
  db: Session = Depends(get_db),
346
  user: User = Depends(get_current_user),
347
  access_token: str | None = Cookie(default=None),
 
348
  ):
349
  document_service.ensure_page_metadata_for_user(db=db, user=user)
350
  agent = build_agent(db=db, user=user)
351
- session_key = access_token or f"user:{user.id}"
 
 
 
 
 
 
352
  config = {"configurable": {"thread_id": session_key}}
 
353
  previous_messages: list[Any] = []
354
  try:
355
  state = agent.get_state(config)
 
1
  import re
2
  from typing import Any
3
 
4
+ from fastapi import Cookie, Depends, FastAPI, File, Form, Header, HTTPException, Request, UploadFile, status
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from fastapi.responses import StreamingResponse
7
  from langchain_core.messages import HumanMessage, ToolMessage
 
345
  db: Session = Depends(get_db),
346
  user: User = Depends(get_current_user),
347
  access_token: str | None = Cookie(default=None),
348
+ x_session_id: str | None = Header(default=None, alias="X-Session-Id"),
349
  ):
350
  document_service.ensure_page_metadata_for_user(db=db, user=user)
351
  agent = build_agent(db=db, user=user)
352
+
353
+ # Use session ID from header if provided, otherwise fall back to access token or user ID
354
+ if x_session_id:
355
+ session_key = f"user:{user.id}:session:{x_session_id}"
356
+ else:
357
+ session_key = access_token or f"user:{user.id}"
358
+
359
  config = {"configurable": {"thread_id": session_key}}
360
+ print(f"[Agent] thread_id: {session_key}")
361
  previous_messages: list[Any] = []
362
  try:
363
  state = agent.get_state(config)
app/services/document_service.py CHANGED
@@ -136,69 +136,19 @@ class DocumentService:
136
  if not docs:
137
  return []
138
 
139
- query_lower = query.lower()
140
- scored: list[tuple[float, str, Document]] = []
141
-
142
- for doc in docs:
143
- score = 0.0
144
-
145
- # Exact phrase matching (highest priority)
146
- if query_lower in (doc.filename or "").lower():
147
- score += 10.0
148
- if query_lower in (doc.summary or "").lower():
149
- score += 5.0
150
- if query_lower in (doc.extracted_preview or "").lower():
151
- score += 2.0
152
-
153
- # Word-level matching
154
- query_words = query_lower.split()
155
- filename_lower = (doc.filename or "").lower()
156
- summary_lower = (doc.summary or "").lower()
157
- preview_lower = (doc.extracted_preview or "").lower()
158
-
159
- for word in query_words:
160
- if len(word) > 2: # Skip very short words
161
- if word in filename_lower:
162
- score += 3.0
163
- if word in summary_lower:
164
- score += 1.5
165
- if word in preview_lower:
166
- score += 0.5
167
-
168
- if score > 0:
169
- scored.append((score, doc.file_hash, doc))
170
 
171
- # Sort by score
172
- scored.sort(reverse=True, key=lambda x: x[0])
173
-
174
- # Take top candidates for LLM (up to 8)
175
- candidates_count = min(max(limit * 2, 8), len(scored)) if scored else min(limit, len(docs))
176
-
177
- if scored:
178
- ranked_docs = [doc for _, _, doc in scored[:candidates_count]]
179
- ranked_hashes = [file_hash for _, file_hash, _ in scored[:candidates_count]]
180
- else:
181
- # No keyword matches, use all docs up to limit
182
- ranked_docs = docs[:candidates_count]
183
- ranked_hashes = [doc.file_hash for doc in ranked_docs]
184
-
185
- # LLM reranking
186
- llm_ranked_hashes = self._llm_verify_document_hashes(query=query, candidates=ranked_docs, limit=limit)
187
-
188
- # Merge: LLM results first, then keyword fallback
189
- merged = llm_ranked_hashes + [h for h in ranked_hashes if h not in llm_ranked_hashes]
190
- return merged[:limit]
191
-
192
-
193
-
194
- def _llm_verify_document_hashes(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
195
  if not self.settings.groq_api_key or not candidates:
196
  return []
197
  if self.matcher_llm is None:
198
  self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
199
 
200
  payload = []
201
- for doc in candidates[:8]:
202
  payload.append(
203
  {
204
  "file_hash": doc.file_hash,
@@ -209,12 +159,15 @@ class DocumentService:
209
  )
210
 
211
  prompt = (
212
- "Rank the most relevant documents for the user query based on semantic similarity.\n"
 
 
 
 
213
  "Return ONLY valid JSON with this exact schema:\n"
214
- '{"file_hashes": ["<hash1>", "<hash2>"]}\n'
215
- f"Return at most {limit} hashes ordered by relevance.\n\n"
216
- f"User query:\n{query}\n\n"
217
- f"Candidates:\n{json.dumps(payload, ensure_ascii=True)}"
218
  )
219
  try:
220
  response = self.matcher_llm.invoke(prompt)
@@ -231,7 +184,8 @@ class DocumentService:
231
  valid = {item.get("file_hash", "") for item in payload}
232
  return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
233
  except Exception:
234
- return []
 
235
 
236
  def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
237
  docs = self.list_user_documents(db, user)
 
136
  if not docs:
137
  return []
138
 
139
+ # Send all documents to LLM for semantic matching
140
+ matched_hashes = self._llm_filter_documents(query=query, candidates=docs, limit=limit)
141
+ print("Documents Matched ----->", matched_hashes)
142
+ return matched_hashes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ def _llm_filter_documents(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if not self.settings.groq_api_key or not candidates:
146
  return []
147
  if self.matcher_llm is None:
148
  self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
149
 
150
  payload = []
151
+ for doc in candidates:
152
  payload.append(
153
  {
154
  "file_hash": doc.file_hash,
 
159
  )
160
 
161
  prompt = (
162
+ "You are a document relevance filter. Analyze the user query and select ONLY the truly relevant documents.\n"
163
+ "Consider semantic similarity, topic alignment, and document purpose.\n\n"
164
+ "IMPORTANT: Only include documents that are actually relevant to answering the query.\n"
165
+ "It's better to return fewer relevant documents than to include irrelevant ones.\n"
166
+ f"You may return anywhere from 0 to {limit} documents.\n\n"
167
  "Return ONLY valid JSON with this exact schema:\n"
168
+ '{"file_hashes": ["<hash1>", "<hash2>", ...]}\n\n'
169
+ f"User query: {query}\n\n"
170
+ f"Available documents:\n{json.dumps(payload, ensure_ascii=True, indent=2)}"
 
171
  )
172
  try:
173
  response = self.matcher_llm.invoke(prompt)
 
184
  valid = {item.get("file_hash", "") for item in payload}
185
  return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
186
  except Exception:
187
+ # Fallback: return first N documents
188
+ return [doc.file_hash for doc in candidates[:limit]]
189
 
190
  def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
191
  docs = self.list_user_documents(db, user)
app/templates/index.html CHANGED
@@ -118,7 +118,10 @@
118
  <section class="card panel chat-shell chat-panel">
119
  <div class="panel-head panel-head-inline">
120
  <h2>DocsQA Chat</h2>
121
- <span class="badge">Markdown enabled</span>
 
 
 
122
  </div>
123
  <div id="chat-thread" class="chat-thread">
124
  <article class="chat-msg assistant">
@@ -142,6 +145,10 @@
142
  </main>
143
 
144
  <script>
 
 
 
 
145
  const registerForm = document.getElementById("register-form");
146
  const loginForm = document.getElementById("login-form");
147
  const logoutForm = document.getElementById("logout-form");
@@ -153,6 +160,22 @@
153
  const chatThread = document.getElementById("chat-thread");
154
  const queryInput = document.getElementById("query");
155
  const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  const safeJson = async (response) => {
158
  try {
@@ -542,7 +565,10 @@
542
 
543
  const response = await fetch("/ask", {
544
  method: "POST",
545
- headers: { "Content-Type": "application/json" },
 
 
 
546
  body: JSON.stringify({ query }),
547
  });
548
  const body = await safeJson(response);
 
118
  <section class="card panel chat-shell chat-panel">
119
  <div class="panel-head panel-head-inline">
120
  <h2>DocsQA Chat</h2>
121
+ <div style="display: flex; gap: 8px; align-items: center;">
122
+ <button type="button" id="new-chat-btn" class="secondary" style="font-size: 0.875rem; padding: 6px 12px;">New Chat</button>
123
+ <span class="badge">Markdown enabled</span>
124
+ </div>
125
  </div>
126
  <div id="chat-thread" class="chat-thread">
127
  <article class="chat-msg assistant">
 
145
  </main>
146
 
147
  <script>
148
+ // Session management
149
+ let currentSessionId = sessionStorage.getItem("chat_session_id") || `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
150
+ sessionStorage.setItem("chat_session_id", currentSessionId);
151
+
152
  const registerForm = document.getElementById("register-form");
153
  const loginForm = document.getElementById("login-form");
154
  const logoutForm = document.getElementById("logout-form");
 
160
  const chatThread = document.getElementById("chat-thread");
161
  const queryInput = document.getElementById("query");
162
  const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
163
+ const newChatBtn = document.getElementById("new-chat-btn");
164
+
165
+ // New Chat button handler
166
+ newChatBtn?.addEventListener("click", () => {
167
+ currentSessionId = `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
168
+ sessionStorage.setItem("chat_session_id", currentSessionId);
169
+ if (chatThread) {
170
+ chatThread.innerHTML = `
171
+ <article class="chat-msg assistant">
172
+ <div class="chat-bubble chat-bubble-assistant chat-markdown">
173
+ <p>Ask anything about your uploaded PDFs and I will answer with citations from retrieved chunks.</p>
174
+ </div>
175
+ </article>
176
+ `;
177
+ }
178
+ });
179
 
180
  const safeJson = async (response) => {
181
  try {
 
565
 
566
  const response = await fetch("/ask", {
567
  method: "POST",
568
+ headers: {
569
+ "Content-Type": "application/json",
570
+ "X-Session-Id": currentSessionId
571
+ },
572
  body: JSON.stringify({ query }),
573
  });
574
  const body = await safeJson(response);