Spaces:

anhkhoiphan
/

092_agent_api

Sleeping

App Files Files Community

anhkhoiphan committed 9 days ago

Commit

dd53ab9

1 Parent(s): a5c91ab

Thay đổi logic tool RAG

Browse files

Files changed (3) hide show

pdf_rag.py +172 -64
prompts.py +9 -0
tools/chat_tools.py +3 -0

pdf_rag.py CHANGED Viewed

@@ -1,26 +1,31 @@
 """
-PDF RAG — chunk, embed (OpenAI), index to Qdrant, hybrid search.
-Hybrid search = dense (semantic, OpenAI embeddings)
-              + sparse/keyword (Qdrant full-text index)
-              merged via Reciprocal Rank Fusion (RRF).
 """
 import logging
 import uuid
 from typing import Optional
 from openai import OpenAI
 from qdrant_client import QdrantClient
 from qdrant_client.models import (
     Distance,
     FieldCondition,
     Filter,
-    MatchText,
     MatchValue,
     PointStruct,
-    TextIndexParams,
-    TokenizerType,
     VectorParams,
 )
@@ -29,21 +34,25 @@ from src.pdf_processing import pdf_to_markdown
 logger = logging.getLogger(__name__)
-_PDF_COLLECTION = "pdf_chunks"
-_EMBED_MODEL    = "text-embedding-3-small"
-_EMBED_DIMS     = 1536
-_CHUNK_SIZE     = 800   # ký tự / chunk
-_CHUNK_OVERLAP  = 150   # ký tự overlap giữa các chunk
-_EMBED_BATCH    = 32    # số chunk embed song song mỗi lần
-_RRF_K = 60  # hằng số RRF (60 là giá trị chuẩn trong tài liệu)
-_qdrant: Optional[QdrantClient] = None
-_openai: Optional[OpenAI]       = None
-# ── Client helpers ────────────────────────────────────────────────────────────
 def _get_qdrant() -> QdrantClient:
     global _qdrant
@@ -64,31 +73,44 @@ def _get_openai() -> OpenAI:
     return _openai
 def _ensure_collection(client: QdrantClient) -> None:
     existing = {c.name for c in client.get_collections().collections}
     if _PDF_COLLECTION not in existing:
         client.create_collection(
             collection_name=_PDF_COLLECTION,
-            vectors_config=VectorParams(size=_EMBED_DIMS, distance=Distance.COSINE),
         )
-        # Full-text index cho keyword search
         client.create_payload_index(
             collection_name=_PDF_COLLECTION,
-            field_name="chunk_text",
-            field_schema=TextIndexParams(
-                type="text",
-                tokenizer=TokenizerType.MULTILINGUAL,
-            ),
         )
-        logger.info("Qdrant: collection '%s' created.", _PDF_COLLECTION)
 # ── Chunking ──────────────────────────────────────────────────────────────────
 def _chunk_text(text: str) -> list[str]:
-    """
-    Chia text thành các chunk có overlap, ưu tiên cắt tại ranh giới câu.
-    """
     if len(text) <= _CHUNK_SIZE:
         return [text.strip()] if text.strip() else []
@@ -98,7 +120,6 @@ def _chunk_text(text: str) -> list[str]:
         end = min(start + _CHUNK_SIZE, len(text))
         if end < len(text):
-            # Tìm ranh giới câu gần nhất để cắt gọn
             for boundary in ('\n\n', '\n', '.', '!', '?'):
                 pos = text.rfind(boundary, start + _CHUNK_SIZE // 2, end)
                 if pos != -1:
@@ -109,9 +130,10 @@ def _chunk_text(text: str) -> list[str]:
         if chunk:
             chunks.append(chunk)
         next_start = end - _CHUNK_OVERLAP
         if next_start <= start:
-            next_start = end          # tránh vòng lặp vô tận
         start = next_start
     return chunks
@@ -128,12 +150,91 @@ def _embed_one(text: str) -> list[float]:
     return _embed_batch([text])[0]
 # ── Public API ────────────────────────────────────────────────────────────────
 def index_pdf(pdf_path: str, pdf_name: str, conversation_id: str) -> int:
     """
-    Đọc PDF, chunk, embed và index lên Qdrant.
-    Dùng UUID v5 làm point ID để upsert idempotent (re-send cùng file không tạo duplicate).
     Returns:
         Số chunk đã index.
@@ -148,8 +249,9 @@ def index_pdf(pdf_path: str, pdf_name: str, conversation_id: str) -> int:
     indexed = 0
     for batch_start in range(0, len(chunks), _EMBED_BATCH):
-        batch   = chunks[batch_start : batch_start + _EMBED_BATCH]
-        vectors = _embed_batch(batch)
         points = [
             PointStruct(
@@ -157,7 +259,10 @@ def index_pdf(pdf_path: str, pdf_name: str, conversation_id: str) -> int:
                     uuid.NAMESPACE_DNS,
                     f"{conversation_id}::{pdf_name}::{batch_start + i}",
                 )),
-                vector=vectors[i],
                 payload={
                     "conversation_id": conversation_id,
                     "pdf_name":        pdf_name,
@@ -180,15 +285,13 @@ def index_pdf(pdf_path: str, pdf_name: str, conversation_id: str) -> int:
 def hybrid_search(query: str, conversation_id: str, top_k: int = 5) -> list[str]:
     """
-    Hybrid search kết hợp:
-      - Dense:   semantic search bằng OpenAI embedding
-      - Sparse:  full-text keyword search (Qdrant TextIndex)
-    Merge bằng Reciprocal Rank Fusion (RRF).
-    Tìm trên TOÀN BỘ PDF đã index cho conversation_id này.
-    Returns:
-        Danh sách chunk text liên quan nhất, sắp xếp theo RRF score.
     """
     client      = _get_qdrant()
     conv_filter = Filter(must=[
@@ -196,42 +299,47 @@ def hybrid_search(query: str, conversation_id: str, top_k: int = 5) -> list[str]
     ])
     # ── Dense search ──────────────────────────────────────────────────────────
-    query_vec  = _embed_one(query)
-    dense_hits = client.search(
         collection_name=_PDF_COLLECTION,
-        query_vector=query_vec,
         query_filter=conv_filter,
         limit=top_k * 3,
         with_payload=True,
-    )
-    # ── Keyword / full-text search ────────────────────────────────────────────
-    kw_filter = Filter(must=[
-        FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id)),
-        FieldCondition(key="chunk_text",      match=MatchText(text=query)),
-    ])
-    kw_hits, _ = client.scroll(
         collection_name=_PDF_COLLECTION,
-        scroll_filter=kw_filter,
         limit=top_k * 3,
         with_payload=True,
-        with_vectors=False,
-    )
     # ── RRF merge ─────────────────────────────────────────────────────────────
-    scores: dict[str, float] = {}
-    texts:  dict[str, str]   = {}
     for rank, hit in enumerate(dense_hits):
         sid = str(hit.id)
-        scores[sid] = scores.get(sid, 0.0) + 1.0 / (rank + _RRF_K)
-        texts[sid]  = hit.payload.get("chunk_text", "")
-    for rank, hit in enumerate(kw_hits):
         sid = str(hit.id)
         scores[sid] = scores.get(sid, 0.0) + 1.0 / (rank + _RRF_K)
-        if sid not in texts:
-            texts[sid] = hit.payload.get("chunk_text", "")
     top_ids = sorted(scores, key=scores.__getitem__, reverse=True)[:top_k]
-    return [texts[sid] for sid in top_ids if sid in texts]

 """
+PDF RAG — chunk, embed, index to Qdrant, hybrid search.
+Hybrid search = dense  (OpenAI text-embedding-3-small, cosine similarity)
+              + sparse (BM25 via fastembed Qdrant/bm25, dot product)
+              merged via Reciprocal Rank Fusion (RRF, k=60).
+Sau RRF, mỗi chunk được mở rộng sang các chunk lân cận (N-3 đến N+3) trong cùng
+PDF để đưa vào context đầy đủ hơn. Không dùng overlap vì neighbor expansion
+đã đảm bảo không mất context tại ranh giới chunk.
 """
 import logging
 import uuid
 from typing import Optional
+from fastembed import SparseTextEmbedding
 from openai import OpenAI
 from qdrant_client import QdrantClient
 from qdrant_client.models import (
     Distance,
     FieldCondition,
     Filter,
+    MatchAny,
     MatchValue,
     PointStruct,
+    SparseVector,
+    SparseVectorParams,
     VectorParams,
 )
 logger = logging.getLogger(__name__)
+# Collection v2: named vectors (dense + sparse). Xóa collection cũ "pdf_chunks" nếu còn.
+_PDF_COLLECTION  = "pdf_chunks_v2"
+_EMBED_MODEL     = "text-embedding-3-small"
+_EMBED_DIMS      = 1536
+_BM25_MODEL      = "Qdrant/bm25"
+_CHUNK_SIZE      = 1000  # ký tự / chunk
+_CHUNK_OVERLAP   = 0     # không cần overlap — neighbor expansion xử lý ranh giới
+_EMBED_BATCH     = 32    # số chunk embed song song mỗi lần
+_RRF_K           = 60    # hằng số RRF chuẩn
+_NEIGHBOR_WINDOW = 3     # fetch N-3 đến N+3 quanh mỗi chunk được retrieve
+_qdrant: Optional[QdrantClient]        = None
+_openai: Optional[OpenAI]             = None
+_bm25:   Optional[SparseTextEmbedding] = None
+# ── Client / model helpers ────────────────────────────────────────────────────
 def _get_qdrant() -> QdrantClient:
     global _qdrant
     return _openai
+def _get_bm25() -> SparseTextEmbedding:
+    """Return the module-wide sparse embedding model, creating it on first use.
+
+    The instance is built with ``model_name=_BM25_MODEL`` and stored in the
+    module-level ``_bm25`` variable, so later calls return the same object.
+    """
+    global _bm25
+    if _bm25 is None:
+        _bm25 = SparseTextEmbedding(model_name=_BM25_MODEL)
+    return _bm25
 def _ensure_collection(client: QdrantClient) -> None:
+    """Create the PDF collection if it is missing, then create its payload indexes.
+
+    The collection uses two named vectors: ``"dense"`` (size ``_EMBED_DIMS``,
+    cosine distance) and ``"sparse"`` (default ``SparseVectorParams``).
+    Payload indexes are requested on every call, not only on creation.
+
+    Args:
+        client: Qdrant client used for all collection and index calls.
+    """
     existing = {c.name for c in client.get_collections().collections}
     if _PDF_COLLECTION not in existing:
         client.create_collection(
             collection_name=_PDF_COLLECTION,
+            vectors_config={
+                "dense": VectorParams(size=_EMBED_DIMS, distance=Distance.COSINE),
+            },
+            # NOTE(review): no IDF modifier is configured for the sparse vector.
+            # Qdrant's BM25 examples appear to set modifier=Modifier.IDF — confirm
+            # whether scoring without IDF is intended here.
+            sparse_vectors_config={
+                "sparse": SparseVectorParams(),
+            },
         )
+        logger.info("Qdrant: collection '%s' created.", _PDF_COLLECTION)
+    # Payload indexes — idempotent, safe to call on every startup.
+    # Keyword indexes back the exact-match filters on conversation and PDF name.
+    for field in ("conversation_id", "pdf_name"):
         client.create_payload_index(
             collection_name=_PDF_COLLECTION,
+            field_name=field,
+            field_schema="keyword",
         )
+    # Integer index backs the chunk_index filter used by neighbor expansion.
+    client.create_payload_index(
+        collection_name=_PDF_COLLECTION,
+        field_name="chunk_index",
+        field_schema="integer",
+    )
 # ── Chunking ──────────────────────────────────────────────────────────────────
 def _chunk_text(text: str) -> list[str]:
     if len(text) <= _CHUNK_SIZE:
         return [text.strip()] if text.strip() else []
         end = min(start + _CHUNK_SIZE, len(text))
         if end < len(text):
             for boundary in ('\n\n', '\n', '.', '!', '?'):
                 pos = text.rfind(boundary, start + _CHUNK_SIZE // 2, end)
                 if pos != -1:
         if chunk:
             chunks.append(chunk)
+        # _CHUNK_OVERLAP = 0, nhưng giữ công thức chung để dễ điều chỉnh sau
         next_start = end - _CHUNK_OVERLAP
         if next_start <= start:
+            next_start = end
         start = next_start
     return chunks
     return _embed_batch([text])[0]
+def _bm25_batch(texts: list[str]) -> list[SparseVector]:
+    """Embed ``texts`` with the sparse model and return Qdrant ``SparseVector`` objects.
+
+    Args:
+        texts: Strings to embed; one sparse vector is produced per embedding
+            yielded by the model.
+
+    Returns:
+        One ``SparseVector`` per embedding, in the order the model yields them.
+    """
+    # embed() is materialized into a list before conversion.
+    embeddings = list(_get_bm25().embed(texts))
+    # .tolist() converts each embedding's indices/values into plain Python lists.
+    return [
+        SparseVector(indices=e.indices.tolist(), values=e.values.tolist())
+        for e in embeddings
+    ]
+def _bm25_one(text: str) -> SparseVector:
+    """Return the sparse vector for a single string via ``_bm25_batch``."""
+    # NOTE(review): this goes through the same embed() path used when indexing
+    # documents. fastembed also appears to expose query_embed() for the query
+    # side — confirm which one is intended when this is used for search.
+    return _bm25_batch([text])[0]
+# ── Neighbor expansion ────────────────────────────────────────────────────────
+def _expand_chunks(
+    client: QdrantClient,
+    conversation_id: str,
+    hits: list[tuple[str, int]],  # (pdf_name, chunk_index)
+    window: int = _NEIGHBOR_WINDOW,
+) -> list[str]:
+    """
+    Fetch the neighbours of each retrieved chunk and stitch them into passages.
+
+    For every (pdf_name, chunk_index) hit, chunks N-window through N+window of
+    the same PDF are requested. Overlapping windows are merged into a single
+    contiguous passage so duplicated content is not added to the context.
+
+    Args:
+        client: Qdrant client used for the scroll requests.
+        conversation_id: Only points with this conversation_id are fetched.
+        hits: Retrieved chunks as (pdf_name, chunk_index) pairs.
+        window: Number of neighbouring chunks to include on each side of a hit.
+
+    Returns:
+        A list of text passages, each one a contiguous run of chunks joined by
+        blank lines. Passages are emitted PDF by PDF (in order of first
+        appearance in ``hits``) and by ascending chunk_index within a PDF, not
+        by retrieval rank.
+    """
+    # Collect every chunk_index to fetch, grouped by pdf_name
+    pdf_needed: dict[str, set[int]] = {}
+    for pdf_name, chunk_index in hits:
+        # Lower bound is clamped at 0; the upper bound is not clamped here.
+        indices = set(range(max(0, chunk_index - window), chunk_index + window + 1))
+        pdf_needed.setdefault(pdf_name, set()).update(indices)
+    results: list[str] = []
+    for pdf_name, needed in pdf_needed.items():
+        fetch_filter = Filter(must=[
+            FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id)),
+            FieldCondition(key="pdf_name",        match=MatchValue(value=pdf_name)),
+            FieldCondition(key="chunk_index",     match=MatchAny(any=sorted(needed))),
+        ])
+        # Single scroll call per PDF; the returned next-page offset is discarded,
+        # so only the first page (at most len(needed) + 5 points) is read.
+        fetched, _ = client.scroll(
+            collection_name=_PDF_COLLECTION,
+            scroll_filter=fetch_filter,
+            limit=len(needed) + 5,
+            with_payload=True,
+            with_vectors=False,
+        )
+        # Map chunk_index -> text, then sort the indices
+        chunk_map = {
+            p.payload["chunk_index"]: p.payload.get("chunk_text", "")
+            for p in fetched
+            if "chunk_index" in p.payload
+        }
+        sorted_indices = sorted(chunk_map)
+        if not sorted_indices:
+            continue
+        # Group consecutive chunk_index values into runs (merges overlapping windows)
+        runs: list[list[int]] = []
+        current: list[int] = [sorted_indices[0]]
+        for idx in sorted_indices[1:]:
+            if idx == current[-1] + 1:
+                current.append(idx)
+            else:
+                runs.append(current)
+                current = [idx]
+        runs.append(current)
+        for run in runs:
+            text = "\n\n".join(chunk_map[i] for i in run)
+            # Skip passages that are empty or whitespace-only.
+            if text.strip():
+                results.append(text)
+    return results
 # ── Public API ────────────────────────────────────────────────────────────────
 def index_pdf(pdf_path: str, pdf_name: str, conversation_id: str) -> int:
     """
+    Đọc PDF, chunk, embed (dense + sparse) và upsert vào Qdrant.
+    UUID v5 làm point ID đảm bảo idempotent — gửi lại cùng file không tạo duplicate.
     Returns:
         Số chunk đã index.
     indexed = 0
     for batch_start in range(0, len(chunks), _EMBED_BATCH):
+        batch       = chunks[batch_start : batch_start + _EMBED_BATCH]
+        dense_vecs  = _embed_batch(batch)
+        sparse_vecs = _bm25_batch(batch)
         points = [
             PointStruct(
                     uuid.NAMESPACE_DNS,
                     f"{conversation_id}::{pdf_name}::{batch_start + i}",
                 )),
+                vector={
+                    "dense":  dense_vecs[i],
+                    "sparse": sparse_vecs[i],
+                },
                 payload={
                     "conversation_id": conversation_id,
                     "pdf_name":        pdf_name,
 def hybrid_search(query: str, conversation_id: str, top_k: int = 5) -> list[str]:
     """
+    Hybrid search:
+      Dense  — OpenAI cosine similarity, Qdrant trả cosine score.
+      Sparse — BM25 dot product, Qdrant trả BM25 score.
+      Merge  — RRF: score = 1/(k + rank_dense) + 1/(k + rank_sparse).
+    Sau RRF, mỗi chunk được mở rộng sang N-3 đến N+3 trong cùng PDF.
+    Các cửa sổ chồng lấp tự động được merge thành đoạn liên tục.
     """
     client      = _get_qdrant()
     conv_filter = Filter(must=[
     ])
     # ── Dense search ──────────────────────────────────────────────────────────
+    dense_hits = client.query_points(
         collection_name=_PDF_COLLECTION,
+        query=_embed_one(query),
+        using="dense",
         query_filter=conv_filter,
         limit=top_k * 3,
         with_payload=True,
+    ).points
+    # ── Sparse (BM25) search ──────────────────────────────────────────────────
+    bm25_vec    = _bm25_one(query)
+    sparse_hits = client.query_points(
         collection_name=_PDF_COLLECTION,
+        query=SparseVector(indices=bm25_vec.indices, values=bm25_vec.values),
+        using="sparse",
+        query_filter=conv_filter,
         limit=top_k * 3,
         with_payload=True,
+    ).points
     # ── RRF merge ─────────────────────────────────────────────────────────────
+    scores:   dict[str, float] = {}
+    payloads: dict[str, dict]  = {}
     for rank, hit in enumerate(dense_hits):
         sid = str(hit.id)
+        scores[sid]   = scores.get(sid, 0.0) + 1.0 / (rank + _RRF_K)
+        payloads[sid] = hit.payload
+    for rank, hit in enumerate(sparse_hits):
         sid = str(hit.id)
         scores[sid] = scores.get(sid, 0.0) + 1.0 / (rank + _RRF_K)
+        if sid not in payloads:
+            payloads[sid] = hit.payload
     top_ids = sorted(scores, key=scores.__getitem__, reverse=True)[:top_k]
+    # ── Neighbor expansion ────────────────────────────────────────────────────
+    hits_meta = [
+        (payloads[sid].get("pdf_name", ""), payloads[sid].get("chunk_index", 0))
+        for sid in top_ids
+        if sid in payloads
+    ]
+    return _expand_chunks(client, conversation_id, hits_meta)

prompts.py CHANGED Viewed

@@ -179,6 +179,14 @@ Nhiệm vụ: phân tích yêu cầu và gọi đúng công cụ để xử lý.
   read_link(url)
       → Đọc và trích xuất nội dung từ đường link URL.
 ═══ CHIẾN LƯỢC GỌI TOOL ═══
   Hỏi về nội dung đã thảo luận / tóm tắt   → summarize_chat
@@ -189,6 +197,7 @@ Nhiệm vụ: phân tích yêu cầu và gọi đúng công cụ để xử lý.
   Muốn tra cứu thông tin đã lưu             → get_memories
   Cần đọc nội dung từ link                  → read_link
   Muốn đặt nhắc nhở                         → add_reminder
 ═══ QUY TẮC BẮT BUỘC ═══

   read_link(url)
       → Đọc và trích xuất nội dung từ đường link URL.
+📚 KNOWLEDGE BASE (Tài liệu lớp học):
+  rag_search(query, conversation_id)
+      → Tìm kiếm trong tài liệu PDF đã được index cho conversation/room này
+        (lecture notes, slides, handouts, đề cương...).
+        Dùng khi người dùng hỏi về nội dung bài học, tài liệu đã chia sẻ,
+        hoặc kiến thức chuyên ngành liên quan đến lớp học.
+        Luôn truyền conversation_id từ thông tin được cung cấp.
 ═══ CHIẾN LƯỢC GỌI TOOL ═══
   Hỏi về nội dung đã thảo luận / tóm tắt   → summarize_chat
   Muốn tra cứu thông tin đã lưu             → get_memories
   Cần đọc nội dung từ link                  → read_link
   Muốn đặt nhắc nhở                         → add_reminder
+  Hỏi về nội dung bài học / tài liệu lớp   → rag_search
 ═══ QUY TẮC BẮT BUỘC ═══

tools/chat_tools.py CHANGED Viewed

@@ -8,6 +8,7 @@ from . import memory as _memory_mod  # noqa: F401
 from . import scheduler as _scheduler_mod  # noqa: F401
 from . import summarizer as _summarizer_mod  # noqa: F401
 from . import chart as _chart_mod  # noqa: F401
 from .base import TOOLS as _REGISTRY, get_langchain_tools
@@ -22,6 +23,8 @@ _ALLOWED = {
     "save_memory", "get_memories",
     # Web
     "read_link",
 }
 TOOLS    = [t for t in get_langchain_tools() if t.name in _ALLOWED]

 from . import scheduler as _scheduler_mod  # noqa: F401
 from . import summarizer as _summarizer_mod  # noqa: F401
 from . import chart as _chart_mod  # noqa: F401
+from . import rag as _rag_mod  # noqa: F401
 from .base import TOOLS as _REGISTRY, get_langchain_tools
     "save_memory", "get_memories",
     # Web
     "read_link",
+    # Knowledge base (PDF RAG)
+    "rag_search",
 }
 TOOLS    = [t for t in get_langchain_tools() if t.name in _ALLOWED]