|
58 | 58 | SOURCE_IDS_LIMIT_METHOD_FIFO, |
59 | 59 | DEFAULT_FILE_PATH_MORE_PLACEHOLDER, |
60 | 60 | DEFAULT_MAX_FILE_PATHS, |
| 61 | + DEFAULT_ENTITY_NAME_MAX_LENGTH, |
61 | 62 | ) |
62 | 63 | from lightrag.kg.shared_storage import get_storage_keyed_lock |
63 | 64 | import time |
|
69 | 70 | load_dotenv(dotenv_path=".env", override=False) |
70 | 71 |
|
71 | 72 |
|
| 73 | +def _truncate_entity_identifier( |
| 74 | + identifier: str, limit: int, chunk_key: str, identifier_role: str |
| 75 | +) -> str: |
| 76 | + """Truncate entity identifiers that exceed the configured length limit.""" |
| 77 | + |
| 78 | + if len(identifier) <= limit: |
| 79 | + return identifier |
| 80 | + |
| 81 | + display_value = identifier[:limit] |
| 82 | + preview = identifier[:20] # Show first 20 characters as preview |
| 83 | + logger.warning( |
| 84 | + "%s: %s exceeded %d characters (len: %d, preview: '%s...'", |
| 85 | + chunk_key, |
| 86 | + identifier_role, |
| 87 | + limit, |
| 88 | + len(identifier), |
| 89 | + preview, |
| 90 | + ) |
| 91 | + return display_value |
| 92 | + |
| 93 | + |
72 | 94 | def chunking_by_token_size( |
73 | 95 | tokenizer: Tokenizer, |
74 | 96 | content: str, |
@@ -952,17 +974,36 @@ async def _process_extraction_result( |
952 | 974 | record_attributes, chunk_key, timestamp, file_path |
953 | 975 | ) |
954 | 976 | if entity_data is not None: |
955 | | - maybe_nodes[entity_data["entity_name"]].append(entity_data) |
| 977 | + truncated_name = _truncate_entity_identifier( |
| 978 | + entity_data["entity_name"], |
| 979 | + DEFAULT_ENTITY_NAME_MAX_LENGTH, |
| 980 | + chunk_key, |
| 981 | + "Entity name", |
| 982 | + ) |
| 983 | + entity_data["entity_name"] = truncated_name |
| 984 | + maybe_nodes[truncated_name].append(entity_data) |
956 | 985 | continue |
957 | 986 |
|
958 | 987 | # Try to parse as relationship |
959 | 988 | relationship_data = await _handle_single_relationship_extraction( |
960 | 989 | record_attributes, chunk_key, timestamp, file_path |
961 | 990 | ) |
962 | 991 | if relationship_data is not None: |
963 | | - maybe_edges[ |
964 | | - (relationship_data["src_id"], relationship_data["tgt_id"]) |
965 | | - ].append(relationship_data) |
| 992 | + truncated_source = _truncate_entity_identifier( |
| 993 | + relationship_data["src_id"], |
| 994 | + DEFAULT_ENTITY_NAME_MAX_LENGTH, |
| 995 | + chunk_key, |
| 996 | + "Relationship source entity", |
| 997 | + ) |
| 998 | + truncated_target = _truncate_entity_identifier( |
| 999 | + relationship_data["tgt_id"], |
| 1000 | + DEFAULT_ENTITY_NAME_MAX_LENGTH, |
| 1001 | + chunk_key, |
| 1002 | + "Relationship target entity", |
| 1003 | + ) |
| 1004 | + relationship_data["src_id"] = truncated_source |
| 1005 | + relationship_data["tgt_id"] = truncated_target |
| 1006 | + maybe_edges[(truncated_source, truncated_target)].append(relationship_data) |
966 | 1007 |
|
967 | 1008 | return dict(maybe_nodes), dict(maybe_edges) |
968 | 1009 |
|
|
0 commit comments