Skip to content

Commit cf2174b

Browse files
authored
Merge pull request HKUDS#2245 from danielaskdd/entity-name-len
Refact: Add Entity Identifier Length Truncation to Prevent Storage Failures
2 parents 3ba1d75 + c92ab83 commit cf2174b

File tree

3 files changed

+47
-5
lines changed

3 files changed

+47
-5
lines changed

‎env.example‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ SUMMARY_LANGUAGE=English
138138
### control the maximum chunk_ids stored in vector and graph db
139139
# MAX_SOURCE_IDS_PER_ENTITY=300
140140
# MAX_SOURCE_IDS_PER_RELATION=300
141-
### control chunk_ids limitation method: FIFO, FIFO
141+
### control chunk_ids limitation method: FIFO, KEEP
142142
### FIFO: First in first out
143143
### KEEP: Keep oldest (less merge action and faster)
144144
# SOURCE_IDS_LIMIT_METHOD=FIFO

‎lightrag/constants.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# Default values for extraction settings
1414
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
1515
DEFAULT_MAX_GLEANING = 1
16+
DEFAULT_ENTITY_NAME_MAX_LENGTH = 256
1617

1718
# Number of description fragments to trigger LLM summary
1819
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8

‎lightrag/operate.py‎

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
SOURCE_IDS_LIMIT_METHOD_FIFO,
5959
DEFAULT_FILE_PATH_MORE_PLACEHOLDER,
6060
DEFAULT_MAX_FILE_PATHS,
61+
DEFAULT_ENTITY_NAME_MAX_LENGTH,
6162
)
6263
from lightrag.kg.shared_storage import get_storage_keyed_lock
6364
import time
@@ -69,6 +70,27 @@
6970
load_dotenv(dotenv_path=".env", override=False)
7071

7172

73+
def _truncate_entity_identifier(
74+
identifier: str, limit: int, chunk_key: str, identifier_role: str
75+
) -> str:
76+
"""Truncate entity identifiers that exceed the configured length limit."""
77+
78+
if len(identifier) <= limit:
79+
return identifier
80+
81+
display_value = identifier[:limit]
82+
preview = identifier[:20] # Show first 20 characters as preview
83+
logger.warning(
84+
"%s: %s exceeded %d characters (len: %d, preview: '%s...'",
85+
chunk_key,
86+
identifier_role,
87+
limit,
88+
len(identifier),
89+
preview,
90+
)
91+
return display_value
92+
93+
7294
def chunking_by_token_size(
7395
tokenizer: Tokenizer,
7496
content: str,
@@ -952,17 +974,36 @@ async def _process_extraction_result(
952974
record_attributes, chunk_key, timestamp, file_path
953975
)
954976
if entity_data is not None:
955-
maybe_nodes[entity_data["entity_name"]].append(entity_data)
977+
truncated_name = _truncate_entity_identifier(
978+
entity_data["entity_name"],
979+
DEFAULT_ENTITY_NAME_MAX_LENGTH,
980+
chunk_key,
981+
"Entity name",
982+
)
983+
entity_data["entity_name"] = truncated_name
984+
maybe_nodes[truncated_name].append(entity_data)
956985
continue
957986

958987
# Try to parse as relationship
959988
relationship_data = await _handle_single_relationship_extraction(
960989
record_attributes, chunk_key, timestamp, file_path
961990
)
962991
if relationship_data is not None:
963-
maybe_edges[
964-
(relationship_data["src_id"], relationship_data["tgt_id"])
965-
].append(relationship_data)
992+
truncated_source = _truncate_entity_identifier(
993+
relationship_data["src_id"],
994+
DEFAULT_ENTITY_NAME_MAX_LENGTH,
995+
chunk_key,
996+
"Relationship source entity",
997+
)
998+
truncated_target = _truncate_entity_identifier(
999+
relationship_data["tgt_id"],
1000+
DEFAULT_ENTITY_NAME_MAX_LENGTH,
1001+
chunk_key,
1002+
"Relationship target entity",
1003+
)
1004+
relationship_data["src_id"] = truncated_source
1005+
relationship_data["tgt_id"] = truncated_target
1006+
maybe_edges[(truncated_source, truncated_target)].append(relationship_data)
9661007

9671008
return dict(maybe_nodes), dict(maybe_edges)
9681009

0 commit comments

Comments
 (0)