|
17 | 17 | from lightrag.utils import xml_to_json |
18 | 18 | from neo4j import GraphDatabase |
19 | 19 | from firecrawl import FirecrawlApp #pip install firecrawl-py |
| 20 | +import fitz # PyMuPDF |
20 | 21 | import tika |
21 | 22 | from tika import parser as tikaParser |
22 | 23 | TIKA_SERVER_JAR = "file:////media/wac/backup/john/johnson/LightRAG/examples/tika-server.jar" |
@@ -75,6 +76,49 @@ def read_file_content(file_path): |
75 | 76 | content = content_text.split("\n") |
76 | 77 | return content |
77 | 78 |
|
def average_pdf_text_num(pdf_path):
    """Return the average number of extracted text characters per PDF page.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        float: Total length of the text returned by ``page.get_text()`` over
        all pages, divided by the page count. Returns 0 when the document
        has no pages.
    """
    # Open via a context manager so the document is closed even when text
    # extraction raises part-way through the pages.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        total_text_length = sum(len(page.get_text()) for page in doc)

    # Guard against division by zero for a document without pages.
    return total_text_length / page_count if page_count > 0 else 0
| 101 | + |
def analyze_pdf_page(pdf_path, page_number, *, text_threshold=100):
    """Report text and image statistics for a single page of a PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 1-based page number (the first page is 1).
        text_threshold (int): A page that contains at least one image and
            fewer than this many extracted text characters is flagged as
            image dominant. Defaults to 100.

    Returns:
        dict: A mapping with three keys:
            - ``"text_length"``: length of the text from ``page.get_text()``.
            - ``"image_count"``: number of entries from
              ``page.get_images(full=True)``.
            - ``"is_image_dominant"``: simple heuristic, True when the page
              has at least one image and fewer than ``text_threshold``
              characters of text.

    Raises:
        IndexError: If ``page_number`` is not within ``1..page_count``.
    """
    # Open via a context manager so the document is closed even when the
    # page lookup or extraction raises.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        # Reject 0 and negative numbers explicitly: after the 1-based to
        # 0-based conversion they would become negative indices and silently
        # select a page counted from the end of the document.
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"page_number {page_number} out of range 1..{page_count}"
            )
        page = doc[page_number - 1]  # convert 1-based number to 0-based index

        # Extract text.
        text_length = len(page.get_text())

        # Extract images.
        image_count = len(page.get_images(full=True))

    return {
        "text_length": text_length,
        "image_count": image_count,
        "is_image_dominant": image_count > 0 and text_length < text_threshold,
    }
| 121 | + |
78 | 122 |
|
79 | 123 | def convert_xml_to_json(xml_path): |
80 | 124 | """Converts XML file to JSON and saves the output.""" |
|
0 commit comments