anhkhoiphan committed on
Commit
20a314b
·
1 Parent(s): 0772afe

Thêm các hàm xử lý pdf

Browse files
Files changed (1) hide show
  1. pdf_processing.py +62 -0
pdf_processing.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import pdfplumber
4
+
5
+
6
+ def _clean_text(text: str) -> str:
7
+ if not text:
8
+ return ""
9
+ text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
10
+ lines = [line.rstrip() for line in text.split('\n')]
11
+ return '\n'.join(lines).strip()
12
+
13
+
14
+ def _table_to_markdown(table: list) -> str:
15
+ if not table:
16
+ return ""
17
+ cleaned = [[str(c).strip() if c else "" for c in row] for row in table]
18
+ num_cols = len(cleaned[0])
19
+ col_widths = [0] * num_cols
20
+ for row in cleaned:
21
+ for i, cell in enumerate(row[:num_cols]):
22
+ col_widths[i] = max(col_widths[i], len(cell))
23
+ lines = []
24
+ header = cleaned[0]
25
+ lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(header[:num_cols])) + " |")
26
+ lines.append("| " + " | ".join("-" * w for w in col_widths) + " |")
27
+ for row in cleaned[1:]:
28
+ lines.append("| " + " | ".join(c.ljust(col_widths[i]) for i, c in enumerate(row[:num_cols])) + " |")
29
+ return "\n".join(lines)
30
+
31
+
32
def pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF file into a single Markdown string.

    For each page a ``## Trang n/total`` heading is emitted after a
    ``---`` separator, followed by the cleaned result of
    ``page.extract_text()`` (when non-empty) and then every non-empty
    table from ``page.extract_tables()``, each under a ``**Bảng i:**``
    label and rendered with ``_table_to_markdown``.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The assembled Markdown, passed through ``_clean_text`` once more.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        page_count = len(document.pages)
        for page_no, page in enumerate(document.pages, start=1):
            chunks.append(f"\n---\n## Trang {page_no}/{page_count}\n")
            page_tables = page.extract_tables()
            page_text = page.extract_text()
            if page_text:
                chunks += [_clean_text(page_text), "\n"]
            # Table numbers follow the position in the extracted list, so
            # an empty table still consumes its index.
            for table_no, rows in enumerate(page_tables or [], start=1):
                if not rows:
                    continue
                chunks.append(f"\n**Bảng {table_no}:**\n")
                chunks.append(_table_to_markdown(rows))
                chunks.append("\n")
    return _clean_text("\n".join(chunks))
50
+
51
+
52
def format_chat_history(messages: list[dict]) -> str:
    """Format chat messages as one ``[timestamp] sender: content`` line each.

    For every message the sender is read from ``senderName`` then
    ``sender_id``, the content from ``content`` then ``message``, and the
    timestamp from ``timestamp`` then ``created_at``. A key counts as
    missing only when its value is ``None`` or ``""``, so a numeric id or
    timestamp of ``0`` is kept rather than discarded.

    Args:
        messages: List of message dicts; may be empty or ``None``.

    Returns:
        The formatted lines joined by newlines, or a fixed placeholder
        string when there are no messages. The ``[timestamp] `` prefix is
        omitted for messages without a timestamp; a missing sender is
        shown as ``?``.
    """
    if not messages:
        return "(Không có lịch sử trò chuyện)"

    def _first_present(message: dict, *keys: str, default=""):
        # A plain `a or b` chain would also skip legitimate falsy values
        # such as the integer 0, so test for None / "" explicitly.
        for key in keys:
            value = message.get(key)
            if value is not None and value != "":
                return value
        return default

    lines = []
    for m in messages:
        sender = _first_present(m, "senderName", "sender_id", default="?")
        content = _first_present(m, "content", "message")
        ts = _first_present(m, "timestamp", "created_at")
        prefix = f"[{ts}] " if ts != "" else ""
        lines.append(f"{prefix}{sender}: {content}")
    return "\n".join(lines)