Skip to content

Commit b48a418

Browse files
author
johnson
committed
2024-12-17_13:21
1 parent 0d68eeb commit b48a418

File tree

3 files changed

+46
-1
lines changed

3 files changed

+46
-1
lines changed

‎.gitignore‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ examples/cache/
1616
examples/data
1717
examples/logs
1818
examples
19+
logs

‎examples/LightRAG_utils.py‎

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from lightrag.utils import xml_to_json
1818
from neo4j import GraphDatabase
1919
from firecrawl import FirecrawlApp #pip install firecrawl-py
20+
import fitz # PyMuPDF
2021
import tika
2122
from tika import parser as tikaParser
2223
TIKA_SERVER_JAR = "file:////media/wac/backup/john/johnson/LightRAG/examples/tika-server.jar"
@@ -75,6 +76,49 @@ def read_file_content(file_path):
7576
content = content_text.split("\n")
7677
return content
7778

79+
def average_pdf_text_num(pdf_path):
    """
    Compute the average length of extracted text per page of a PDF.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        float: Total length of the text returned by ``page.get_text()`` over
            all pages, divided by the page count; 0.0 when the document has
            no pages.
    """
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        total_text_length = sum(len(page.get_text()) for page in doc)
    finally:
        # Always release the document, even if text extraction fails.
        doc.close()

    # Guard against a zero-page document to avoid ZeroDivisionError.
    return total_text_length / page_count if page_count > 0 else 0.0
101+
102+
def analyze_pdf_page(pdf_path, page_number):
    """
    Report the text length and image count of a single PDF page.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 1-based number of the page to analyze.

    Returns:
        dict: A mapping with three keys:
            ``text_length`` - length of the text returned by ``page.get_text()``;
            ``image_count`` - number of entries returned by
            ``page.get_images(full=True)``;
            ``is_image_dominant`` - True when the page has at least one image
            and fewer than 100 characters of text (a simple heuristic).

    Raises:
        ValueError: If ``page_number`` is less than 1.
    """
    # Reject non-positive page numbers up front: page_number - 1 would turn
    # into a negative index and would not address the requested page.
    if page_number < 1:
        raise ValueError(f"page_number must be >= 1, got {page_number}")

    doc = fitz.open(pdf_path)
    try:
        # page_number is 1-based while document indices are 0-based.
        page = doc[page_number - 1]

        # Extract the text.
        text_length = len(page.get_text())

        # Extract the images.
        image_count = len(page.get_images(full=True))
    finally:
        # Always release the document, even if the page lookup fails.
        doc.close()

    return {
        "text_length": text_length,
        "image_count": image_count,
        "is_image_dominant": image_count > 0 and text_length < 100,  # simple heuristic
    }
121+
78122

79123
def convert_xml_to_json(xml_path):
80124
"""Converts XML file to JSON and saves the output."""

‎lightrag/lightrag.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ async def ainsert_only_naive(self, string_or_strings):
476476
logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
477477

478478
await self.chunks_vdb.upsert(inserting_chunks)
479-
await self.text_chunks.upsert(inserting_chunks)
479+
await self.text_chunks.upsert(inserting_chunks)
480480
finally:
481481
if update_storage:
482482
await self._insert_done()

0 commit comments

Comments
 (0)