feat(document-matching): simplify to LLM-based document filtering
Browse files- Replace two-stage keyword + LLM reranking with pure LLM semantic filtering
- Update document matching to send all user documents to LLM for analysis
- LLM now selects 0 to N relevant documents based on actual relevance instead of a forced limit
- Rename _llm_verify_document_hashes to _llm_filter_documents for clarity
- Add X-Session-Id header support for improved session tracking in ask_question endpoint
- Update README to reflect simplified document filtering approach
- Add session ID logging for debugging agent thread IDs
- Improve LLM prompt to emphasize quality over quantity in document selection
- README.md +7 -7
- app/main.py +10 -2
- app/services/document_service.py +16 -62
- app/templates/index.html +28 -2
README.md
CHANGED
|
@@ -27,8 +27,8 @@ Uploaded PDFs are parsed page by page and split into chunks.
|
|
| 27 |
Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
|
| 28 |
|
| 29 |
At question time:
|
| 30 |
-
1.
|
| 31 |
-
2.
|
| 32 |
3. The agent answers from those chunks when possible
|
| 33 |
4. If evidence is weak, the agent uses web search and cites external URLs
|
| 34 |
|
|
@@ -84,12 +84,12 @@ Why I chose this:
|
|
| 84 |
- avoids duplicate indexing,
|
| 85 |
- keeps retrieval secure per user.
|
| 86 |
|
| 87 |
-
I also implemented
|
| 88 |
|
| 89 |
-
-
|
| 90 |
-
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
|
| 94 |
## Challenges I Ran Into
|
| 95 |
|
|
|
|
| 27 |
Each chunk is stored with metadata (document, page number, chunk index) and embedded into `pgvector`.
|
| 28 |
|
| 29 |
At question time:
|
| 30 |
+
1. LLM-based document filtering selects relevant documents from user's library
|
| 31 |
+
2. Vector search retrieves relevant chunks from selected documents
|
| 32 |
3. The agent answers from those chunks when possible
|
| 33 |
4. If evidence is weak, the agent uses web search and cites external URLs
|
| 34 |
|
|
|
|
| 84 |
- avoids duplicate indexing,
|
| 85 |
- keeps retrieval secure per user.
|
| 86 |
|
| 87 |
+
I also implemented LLM-based document filtering:
|
| 88 |
|
| 89 |
+
- The system sends all user documents (filename, summary, preview) to the LLM
|
| 90 |
+
- LLM semantically analyzes and selects only truly relevant documents for the query
|
| 91 |
+
- Returns 0 to N documents based on actual relevance (not forced to always return the maximum limit)
|
| 92 |
+
- A fallback returns the first N documents if the LLM call fails
|
| 93 |
|
| 94 |
## Challenges I Ran Into
|
| 95 |
|
app/main.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
from typing import Any
|
| 3 |
|
| 4 |
-
from fastapi import Cookie, Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, status
|
| 5 |
from fastapi.responses import HTMLResponse, JSONResponse
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
from langchain_core.messages import HumanMessage, ToolMessage
|
|
@@ -345,11 +345,19 @@ def ask_question(
|
|
| 345 |
db: Session = Depends(get_db),
|
| 346 |
user: User = Depends(get_current_user),
|
| 347 |
access_token: str | None = Cookie(default=None),
|
|
|
|
| 348 |
):
|
| 349 |
document_service.ensure_page_metadata_for_user(db=db, user=user)
|
| 350 |
agent = build_agent(db=db, user=user)
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
config = {"configurable": {"thread_id": session_key}}
|
|
|
|
| 353 |
previous_messages: list[Any] = []
|
| 354 |
try:
|
| 355 |
state = agent.get_state(config)
|
|
|
|
| 1 |
import re
|
| 2 |
from typing import Any
|
| 3 |
|
| 4 |
+
from fastapi import Cookie, Depends, FastAPI, File, Form, Header, HTTPException, Request, UploadFile, status
|
| 5 |
from fastapi.responses import HTMLResponse, JSONResponse
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
from langchain_core.messages import HumanMessage, ToolMessage
|
|
|
|
| 345 |
db: Session = Depends(get_db),
|
| 346 |
user: User = Depends(get_current_user),
|
| 347 |
access_token: str | None = Cookie(default=None),
|
| 348 |
+
x_session_id: str | None = Header(default=None, alias="X-Session-Id"),
|
| 349 |
):
|
| 350 |
document_service.ensure_page_metadata_for_user(db=db, user=user)
|
| 351 |
agent = build_agent(db=db, user=user)
|
| 352 |
+
|
| 353 |
+
# Use session ID from header if provided, otherwise fall back to access token or user ID
|
| 354 |
+
if x_session_id:
|
| 355 |
+
session_key = f"user:{user.id}:session:{x_session_id}"
|
| 356 |
+
else:
|
| 357 |
+
session_key = access_token or f"user:{user.id}"
|
| 358 |
+
|
| 359 |
config = {"configurable": {"thread_id": session_key}}
|
| 360 |
+
print(f"[Agent] thread_id: {session_key}")
|
| 361 |
previous_messages: list[Any] = []
|
| 362 |
try:
|
| 363 |
state = agent.get_state(config)
|
app/services/document_service.py
CHANGED
|
@@ -136,69 +136,19 @@ class DocumentService:
|
|
| 136 |
if not docs:
|
| 137 |
return []
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
score = 0.0
|
| 144 |
-
|
| 145 |
-
# Exact phrase matching (highest priority)
|
| 146 |
-
if query_lower in (doc.filename or "").lower():
|
| 147 |
-
score += 10.0
|
| 148 |
-
if query_lower in (doc.summary or "").lower():
|
| 149 |
-
score += 5.0
|
| 150 |
-
if query_lower in (doc.extracted_preview or "").lower():
|
| 151 |
-
score += 2.0
|
| 152 |
-
|
| 153 |
-
# Word-level matching
|
| 154 |
-
query_words = query_lower.split()
|
| 155 |
-
filename_lower = (doc.filename or "").lower()
|
| 156 |
-
summary_lower = (doc.summary or "").lower()
|
| 157 |
-
preview_lower = (doc.extracted_preview or "").lower()
|
| 158 |
-
|
| 159 |
-
for word in query_words:
|
| 160 |
-
if len(word) > 2: # Skip very short words
|
| 161 |
-
if word in filename_lower:
|
| 162 |
-
score += 3.0
|
| 163 |
-
if word in summary_lower:
|
| 164 |
-
score += 1.5
|
| 165 |
-
if word in preview_lower:
|
| 166 |
-
score += 0.5
|
| 167 |
-
|
| 168 |
-
if score > 0:
|
| 169 |
-
scored.append((score, doc.file_hash, doc))
|
| 170 |
|
| 171 |
-
|
| 172 |
-
scored.sort(reverse=True, key=lambda x: x[0])
|
| 173 |
-
|
| 174 |
-
# Take top candidates for LLM (up to 8)
|
| 175 |
-
candidates_count = min(max(limit * 2, 8), len(scored)) if scored else min(limit, len(docs))
|
| 176 |
-
|
| 177 |
-
if scored:
|
| 178 |
-
ranked_docs = [doc for _, _, doc in scored[:candidates_count]]
|
| 179 |
-
ranked_hashes = [file_hash for _, file_hash, _ in scored[:candidates_count]]
|
| 180 |
-
else:
|
| 181 |
-
# No keyword matches, use all docs up to limit
|
| 182 |
-
ranked_docs = docs[:candidates_count]
|
| 183 |
-
ranked_hashes = [doc.file_hash for doc in ranked_docs]
|
| 184 |
-
|
| 185 |
-
# LLM reranking
|
| 186 |
-
llm_ranked_hashes = self._llm_verify_document_hashes(query=query, candidates=ranked_docs, limit=limit)
|
| 187 |
-
|
| 188 |
-
# Merge: LLM results first, then keyword fallback
|
| 189 |
-
merged = llm_ranked_hashes + [h for h in ranked_hashes if h not in llm_ranked_hashes]
|
| 190 |
-
return merged[:limit]
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
def _llm_verify_document_hashes(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
|
| 195 |
if not self.settings.groq_api_key or not candidates:
|
| 196 |
return []
|
| 197 |
if self.matcher_llm is None:
|
| 198 |
self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
|
| 199 |
|
| 200 |
payload = []
|
| 201 |
-
for doc in candidates
|
| 202 |
payload.append(
|
| 203 |
{
|
| 204 |
"file_hash": doc.file_hash,
|
|
@@ -209,12 +159,15 @@ class DocumentService:
|
|
| 209 |
)
|
| 210 |
|
| 211 |
prompt = (
|
| 212 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
"Return ONLY valid JSON with this exact schema:\n"
|
| 214 |
-
'{"file_hashes": ["<hash1>", "<hash2>"]}\n'
|
| 215 |
-
f"
|
| 216 |
-
f"
|
| 217 |
-
f"Candidates:\n{json.dumps(payload, ensure_ascii=True)}"
|
| 218 |
)
|
| 219 |
try:
|
| 220 |
response = self.matcher_llm.invoke(prompt)
|
|
@@ -231,7 +184,8 @@ class DocumentService:
|
|
| 231 |
valid = {item.get("file_hash", "") for item in payload}
|
| 232 |
return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
|
| 233 |
except Exception:
|
| 234 |
-
return
|
|
|
|
| 235 |
|
| 236 |
def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
|
| 237 |
docs = self.list_user_documents(db, user)
|
|
|
|
| 136 |
if not docs:
|
| 137 |
return []
|
| 138 |
|
| 139 |
+
# Send all documents to LLM for semantic matching
|
| 140 |
+
matched_hashes = self._llm_filter_documents(query=query, candidates=docs, limit=limit)
|
| 141 |
+
print("Documents Matched ----->", matched_hashes)
|
| 142 |
+
return matched_hashes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
def _llm_filter_documents(self, *, query: str, candidates: list[Document], limit: int) -> list[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
if not self.settings.groq_api_key or not candidates:
|
| 146 |
return []
|
| 147 |
if self.matcher_llm is None:
|
| 148 |
self.matcher_llm = ChatGroq(api_key=self.settings.groq_api_key, model=self.settings.model_name, temperature=0)
|
| 149 |
|
| 150 |
payload = []
|
| 151 |
+
for doc in candidates:
|
| 152 |
payload.append(
|
| 153 |
{
|
| 154 |
"file_hash": doc.file_hash,
|
|
|
|
| 159 |
)
|
| 160 |
|
| 161 |
prompt = (
|
| 162 |
+
"You are a document relevance filter. Analyze the user query and select ONLY the truly relevant documents.\n"
|
| 163 |
+
"Consider semantic similarity, topic alignment, and document purpose.\n\n"
|
| 164 |
+
"IMPORTANT: Only include documents that are actually relevant to answering the query.\n"
|
| 165 |
+
"It's better to return fewer relevant documents than to include irrelevant ones.\n"
|
| 166 |
+
f"You may return anywhere from 0 to {limit} documents.\n\n"
|
| 167 |
"Return ONLY valid JSON with this exact schema:\n"
|
| 168 |
+
'{"file_hashes": ["<hash1>", "<hash2>", ...]}\n\n'
|
| 169 |
+
f"User query: {query}\n\n"
|
| 170 |
+
f"Available documents:\n{json.dumps(payload, ensure_ascii=True, indent=2)}"
|
|
|
|
| 171 |
)
|
| 172 |
try:
|
| 173 |
response = self.matcher_llm.invoke(prompt)
|
|
|
|
| 184 |
valid = {item.get("file_hash", "") for item in payload}
|
| 185 |
return [value for value in hashes if isinstance(value, str) and value in valid][:limit]
|
| 186 |
except Exception:
|
| 187 |
+
# Fallback: return first N documents
|
| 188 |
+
return [doc.file_hash for doc in candidates[:limit]]
|
| 189 |
|
| 190 |
def ensure_page_metadata_for_user(self, *, db: Session, user: User) -> None:
|
| 191 |
docs = self.list_user_documents(db, user)
|
app/templates/index.html
CHANGED
|
@@ -118,7 +118,10 @@
|
|
| 118 |
<section class="card panel chat-shell chat-panel">
|
| 119 |
<div class="panel-head panel-head-inline">
|
| 120 |
<h2>DocsQA Chat</h2>
|
| 121 |
-
<
|
|
|
|
|
|
|
|
|
|
| 122 |
</div>
|
| 123 |
<div id="chat-thread" class="chat-thread">
|
| 124 |
<article class="chat-msg assistant">
|
|
@@ -142,6 +145,10 @@
|
|
| 142 |
</main>
|
| 143 |
|
| 144 |
<script>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
const registerForm = document.getElementById("register-form");
|
| 146 |
const loginForm = document.getElementById("login-form");
|
| 147 |
const logoutForm = document.getElementById("logout-form");
|
|
@@ -153,6 +160,22 @@
|
|
| 153 |
const chatThread = document.getElementById("chat-thread");
|
| 154 |
const queryInput = document.getElementById("query");
|
| 155 |
const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
const safeJson = async (response) => {
|
| 158 |
try {
|
|
@@ -542,7 +565,10 @@
|
|
| 542 |
|
| 543 |
const response = await fetch("/ask", {
|
| 544 |
method: "POST",
|
| 545 |
-
headers: {
|
|
|
|
|
|
|
|
|
|
| 546 |
body: JSON.stringify({ query }),
|
| 547 |
});
|
| 548 |
const body = await safeJson(response);
|
|
|
|
| 118 |
<section class="card panel chat-shell chat-panel">
|
| 119 |
<div class="panel-head panel-head-inline">
|
| 120 |
<h2>DocsQA Chat</h2>
|
| 121 |
+
<div style="display: flex; gap: 8px; align-items: center;">
|
| 122 |
+
<button type="button" id="new-chat-btn" class="secondary" style="font-size: 0.875rem; padding: 6px 12px;">New Chat</button>
|
| 123 |
+
<span class="badge">Markdown enabled</span>
|
| 124 |
+
</div>
|
| 125 |
</div>
|
| 126 |
<div id="chat-thread" class="chat-thread">
|
| 127 |
<article class="chat-msg assistant">
|
|
|
|
| 145 |
</main>
|
| 146 |
|
| 147 |
<script>
|
| 148 |
+
// Session management
|
| 149 |
+
let currentSessionId = sessionStorage.getItem("chat_session_id") || `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
|
| 150 |
+
sessionStorage.setItem("chat_session_id", currentSessionId);
|
| 151 |
+
|
| 152 |
const registerForm = document.getElementById("register-form");
|
| 153 |
const loginForm = document.getElementById("login-form");
|
| 154 |
const logoutForm = document.getElementById("logout-form");
|
|
|
|
| 160 |
const chatThread = document.getElementById("chat-thread");
|
| 161 |
const queryInput = document.getElementById("query");
|
| 162 |
const docDeleteButtons = document.querySelectorAll(".doc-delete-btn");
|
| 163 |
+
const newChatBtn = document.getElementById("new-chat-btn");
|
| 164 |
+
|
| 165 |
+
// New Chat button handler
|
| 166 |
+
newChatBtn?.addEventListener("click", () => {
|
| 167 |
+
currentSessionId = `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
|
| 168 |
+
sessionStorage.setItem("chat_session_id", currentSessionId);
|
| 169 |
+
if (chatThread) {
|
| 170 |
+
chatThread.innerHTML = `
|
| 171 |
+
<article class="chat-msg assistant">
|
| 172 |
+
<div class="chat-bubble chat-bubble-assistant chat-markdown">
|
| 173 |
+
<p>Ask anything about your uploaded PDFs and I will answer with citations from retrieved chunks.</p>
|
| 174 |
+
</div>
|
| 175 |
+
</article>
|
| 176 |
+
`;
|
| 177 |
+
}
|
| 178 |
+
});
|
| 179 |
|
| 180 |
const safeJson = async (response) => {
|
| 181 |
try {
|
|
|
|
| 565 |
|
| 566 |
const response = await fetch("/ask", {
|
| 567 |
method: "POST",
|
| 568 |
+
headers: {
|
| 569 |
+
"Content-Type": "application/json",
|
| 570 |
+
"X-Session-Id": currentSessionId
|
| 571 |
+
},
|
| 572 |
body: JSON.stringify({ query }),
|
| 573 |
});
|
| 574 |
const body = await safeJson(response);
|