Skip to content

Commit f0d491e

Browse files
authored
Merge pull request #1020 from Valyrian-Code/fix/dedup-public-dataloss
Avoid data loss in dedup_public.py when a link cannot be created
2 parents 1f64f48 + bc5efe1 commit f0d491e

2 files changed

Lines changed: 79 additions & 4 deletions

File tree

‎scripts/dedup_public.py‎

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,16 +179,28 @@ def is_allowed(path: Path) -> bool:
179179
if args.dry_run:
180180
print(f"DRY-RUN: replace {p} -> link to {rel_target}")
181181
else:
182+
# Create the replacement link at a temporary path first, then
183+
# atomically move it over the original. Unlinking the original
184+
# before the link existed meant a failed os.symlink/os.link left
185+
# the file deleted with no replacement, losing it permanently.
186+
tmp = p.with_name(p.name + '.dedup-tmp')
182187
try:
183-
p.unlink()
188+
if tmp.is_symlink() or tmp.exists():
189+
tmp.unlink()
184190
if args.mode == 'symlink':
185-
os.symlink(rel_target, p)
191+
os.symlink(rel_target, tmp)
186192
else:
187-
os.link(canonical, p)
193+
os.link(canonical, tmp)
194+
os.replace(tmp, p)
188195
linked += 1
189196
except Exception as e:
190197
print(f"ERROR: Failed to link {p} -> {rel_target}: {e}", file=sys.stderr)
191-
# If linking fails, leave file as-is
198+
# Clean up the temporary link; the original file is intact.
199+
try:
200+
if tmp.is_symlink() or tmp.exists():
201+
tmp.unlink()
202+
except OSError:
203+
pass
192204
processed += 1
193205

194206
print(f"Dedup complete. processed={processed} linked={linked} unique={len(canonical_by_hash)} base={base_root}")

‎test/test_dedup_public.py‎

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# -*- coding: utf-8 -*-
2+
"""Regression tests for scripts/dedup_public.py.
3+
4+
The deduplicator replaces files that are byte-identical to a base-language file
5+
with a link to that canonical copy. It must never delete a file before the
6+
replacement link exists, otherwise a failed link leaves the published asset
7+
gone for good.
8+
"""
9+
import os
10+
import sys
11+
12+
import dedup_public
13+
14+
15+
def _run(public_dir, mode="symlink"):
16+
argv = [
17+
"dedup_public.py",
18+
"--public-dir", str(public_dir),
19+
"--ext", "png",
20+
"--mode", mode,
21+
]
22+
old_argv = sys.argv
23+
sys.argv = argv
24+
try:
25+
return dedup_public.main()
26+
finally:
27+
sys.argv = old_argv
28+
29+
30+
def _make_duplicate_tree(tmp_path):
31+
"""A canonical file at the root and an identical copy under a subdir."""
32+
content = b"DUPLICATE-IMAGE-BYTES"
33+
(tmp_path / "logo.png").write_bytes(content)
34+
sub = tmp_path / "fr"
35+
sub.mkdir()
36+
dup = sub / "logo.png"
37+
dup.write_bytes(content)
38+
return dup, content
39+
40+
41+
def test_failed_link_does_not_delete_original(tmp_path, monkeypatch):
42+
dup, content = _make_duplicate_tree(tmp_path)
43+
44+
def boom(*args, **kwargs):
45+
raise OSError("simulated link failure")
46+
47+
monkeypatch.setattr(dedup_public.os, "symlink", boom)
48+
49+
assert _run(tmp_path) == 0
50+
# The duplicate must survive intact when the link could not be created.
51+
assert dup.is_file() and not dup.is_symlink()
52+
assert dup.read_bytes() == content
53+
54+
55+
def test_duplicate_is_replaced_with_symlink_on_success(tmp_path):
56+
dup, content = _make_duplicate_tree(tmp_path)
57+
58+
assert _run(tmp_path) == 0
59+
# On success the duplicate becomes a symlink that still resolves to the
60+
# original bytes, and no stale temp file is left behind.
61+
assert dup.is_symlink()
62+
assert dup.read_bytes() == content
63+
assert not (dup.parent / (dup.name + ".dedup-tmp")).exists()

0 commit comments

Comments
 (0)