Skip to content

Commit c08a749

Browse files
committed
Add sanitize_title. Docs.
1 parent a1d1980 commit c08a749

File tree

4 files changed

+83
-38
lines changed

4 files changed

+83
-38
lines changed

‎README.md‎

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,47 @@ strings, timestamps, ages, and sizes.
99

1010
Simply a more convenient wrapper around `humanize`, `humanfriendly`, and `strif`.
1111

12+
## Installation
13+
14+
```
15+
# Use pip
16+
pip install prettyfmt
17+
# Or poetry
18+
poetry add prettyfmt
19+
```
20+
21+
## Usage
22+
1223
```python
1324
from prettyfmt import *
1425

26+
abbrev_str("very " * 100 + "long", 32)
27+
🢂 'very very very very very very v…'
28+
1529
# Simple abbreviations of objects:
1630
abbrev_obj({"a": "very " * 100 + "long", "b": 23})
1731
🢂 "{a='very very very very very very very very very very very very ver…', b=23}"
1832

33+
abbrev_obj(["word " * i for i in range(10)], field_max_len=10, list_max_len=4)
34+
🢂 "['', 'word ', 'word word ', 'word word…', …]"
35+
1936
# Abbreviate but don't break words. Combine with slugifiers.
2037
abbrev_on_words("very " * 100 + "long", 30)
2138
🢂 'very very very very very very…'
2239

2340
# My favorite, very good for abbreviating a long title to get a shorter one,
2441
# or a good filename.
2542
abbrev_phrase_in_middle("very " * 100 + "long", 40)
26-
🢂 'very very very very … very very very long'
43+
🢂 'very very very very … very very very long'
44+
45+
# Useful for cleaning up document titles and filenames.
46+
ugly_title = "A Very\tVery Very Needlessly Long {Strange} Document Title [final edited draft23]"
47+
abbrev_phrase_in_middle(sanitize_title(ugly_title))
48+
🢂 'A Very Very Very Needlessly Long Strange … final edited draft23'
49+
50+
from slugify import slugify
51+
slugify(abbrev_phrase_in_middle(sanitize_title(ugly_title)))
52+
🢂 'a-very-very-very-needlessly-long-strange-final-edited-draft23'
2753

2854
# Ages in seconds or deltas.
2955
fmt_age(60 * 60 * 24 * 23)

‎src/prettyfmt/__init__.py‎

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,20 @@
1+
from .prettyfmt import * # noqa: F403
2+
from strif import abbrev_list, abbrev_str, quote_if_needed, single_line
3+
14
__all__ = ( # noqa: F405
25
"abbrev_obj",
36
"abbrev_on_words",
47
"abbrev_phrase_in_middle",
5-
"abbrev_str",
68
"fmt_age",
79
"fmt_time",
810
"fmt_size_human",
911
"fmt_size_dual",
1012
"fmt_words",
1113
"fmt_paras",
14+
"sanitize_title",
15+
# Re-export strif functions for convenience:
16+
"abbrev_str",
17+
"abbrev_list",
18+
"single_line",
19+
"quote_if_needed",
1220
)
13-
14-
from .prettyfmt import * # noqa: F403

‎src/prettyfmt/prettyfmt.py‎

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -128,15 +128,11 @@ def __str__(self) -> str:
128128
visited.add(id(value))
129129

130130
if isinstance(value, list):
131-
truncated_list = value[:list_max_len] + (
132-
["…"] if len(value) > list_max_len else []
133-
)
131+
truncated_list = value[:list_max_len] + (["…"] if len(value) > list_max_len else [])
134132
return (
135133
"["
136134
+ ", ".join(
137-
abbrev_obj(
138-
item, field_max_len, list_max_len, key_filter, value_filter, visited
139-
)
135+
abbrev_obj(item, field_max_len, list_max_len, key_filter, value_filter, visited)
140136
for item in truncated_list
141137
)
142138
+ "]"
@@ -152,11 +148,7 @@ def __str__(self) -> str:
152148
)
153149

154150
if isinstance(value, dict):
155-
return (
156-
"{"
157-
+ _format_kvs(value.items(), field_max_len, key_filter, value_filter)
158-
+ "}"
159-
)
151+
return "{" + _format_kvs(value.items(), field_max_len, key_filter, value_filter) + "}"
160152

161153
if isinstance(value, Enum):
162154
return value.name
@@ -168,10 +160,11 @@ def _trim_trailing_punctuation(text: str) -> str:
168160
return re.sub(r"[.,;:!?]+$", "", text)
169161

170162

171-
def abbrev_on_words(text: str, max_len: int, indicator: str = "…") -> str:
163+
def abbrev_on_words(text: str, max_len: int = 64, indicator: str = "…") -> str:
172164
"""
173-
Abbreviate text to a maximum length, breaking on whole words (unless the first word
174-
is too long). For aesthetics, removes trailing punctuation from the last word.
165+
Abbreviate text to a maximum character length, breaking on whole words
166+
(unless the first word is too long). For aesthetics, removes trailing
167+
punctuation from the last word.
175168
"""
176169
if len(text) <= max_len:
177170
return text
@@ -180,21 +173,19 @@ def abbrev_on_words(text: str, max_len: int, indicator: str = "…") -> str:
180173
if words and max_len and len(words[0]) > max_len:
181174
return abbrev_str(words[0], max_len, indicator)
182175

183-
while (
184-
words
185-
and len(_trim_trailing_punctuation(" ".join(words))) + len(indicator) > max_len
186-
):
176+
while words and len(_trim_trailing_punctuation(" ".join(words))) + len(indicator) > max_len:
187177
words.pop()
188178

189179
return _trim_trailing_punctuation(" ".join(words)) + indicator
190180

191181

192182
def abbrev_phrase_in_middle(
193-
phrase: str, max_len: int, ellipsis="…", max_trailing_len: int = 0
183+
phrase: str, max_len: int = 64, ellipsis="…", max_trailing_len: int = 0
194184
) -> str:
195185
"""
196-
Abbreviate a phrase to a maximum length, preserving the first and last few words of
197-
the phrase whenever possible. The ellipsis is inserted in the middle of the phrase.
186+
Abbreviate a phrase to a maximum character length, preserving the first and last
187+
few words of the phrase whenever possible. The ellipsis is inserted in the middle
188+
of the phrase.
198189
"""
199190
if not max_trailing_len:
200191
max_trailing_len = min(int(max_len / 2), max(16, int(max_len / 4)))
@@ -214,10 +205,7 @@ def abbrev_phrase_in_middle(
214205
# Walk through the split words, and tally total number of chars as we go.
215206
for i in range(len(words)):
216207
words[i] = abbrev_str(words[i], max_len, ellipsis)
217-
if (
218-
prefix_tally + len(words[i]) + len(ellipsis) + max_trailing_len >= max_len
219-
and i > 0
220-
):
208+
if prefix_tally + len(words[i]) + len(ellipsis) + max_trailing_len >= max_len and i > 0:
221209
prefix_end_index = i
222210
break
223211
prefix_tally += len(words[i]) + 1
@@ -377,3 +365,19 @@ def fmt_paras(*paras: str | None, sep: str = "\n\n") -> str:
377365
"""
378366
filtered_paras = [para.strip() for para in paras if para is not None]
379367
return sep.join(para for para in filtered_paras if para)
368+
369+
370+
DEFAULT_PUNCTUATION = ",./:;'!?/@%&()+“”‘’…–—-"
371+
372+
373+
def sanitize_title(text: str, allowed_chars: str = DEFAULT_PUNCTUATION) -> str:
374+
"""
375+
Simple sanitization for arbitrary text to make it suitable for a title or filename.
376+
Convert all whitespace to spaces. By default allows the most common punctuation,
377+
letters, and numbers, but not Markdown chars like `*` or `[]`, code characters, etc.
378+
"""
379+
# Note \w (which already includes \d and `_`) matches Unicode letters and digits in Python 3 str patterns.
380+
# If we had the regex package on hand we could use \p{L}\p{N} instead of \w\d
381+
# but probably not worth the import.
382+
escaped_chars = re.escape(allowed_chars)
383+
return re.sub(r"[^\w\d" + escaped_chars + "]+", " ", text).strip()

‎tests/test_prettyfmt.py‎

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
from prettyfmt import abbrev_on_words, abbrev_phrase_in_middle, fmt_words
1+
from prettyfmt import (
2+
abbrev_on_words,
3+
abbrev_phrase_in_middle,
4+
fmt_words,
5+
sanitize_title,
6+
)
27

38

49
def test_abbreviate_on_words():
@@ -16,17 +21,12 @@ def test_abbreviate_on_words():
1621

1722

1823
def test_abbreviate_phrase_in_middle():
24+
assert abbrev_phrase_in_middle("Hello, World! This is a test.", 16) == "Hello, … a test."
1925
assert (
20-
abbrev_phrase_in_middle("Hello, World! This is a test.", 16)
21-
== "Hello, … a test."
26+
abbrev_phrase_in_middle("Hello, World! This is a test.", 23) == "Hello, … This is a test."
2227
)
2328
assert (
24-
abbrev_phrase_in_middle("Hello, World! This is a test.", 23)
25-
== "Hello, … This is a test."
26-
)
27-
assert (
28-
abbrev_phrase_in_middle("Hello, World! This is a test.", 27)
29-
== "Hello, … This is a test."
29+
abbrev_phrase_in_middle("Hello, World! This is a test.", 27) == "Hello, … This is a test."
3030
)
3131
assert (
3232
abbrev_phrase_in_middle("Hello, World! This is a test.", 40)
@@ -69,3 +69,12 @@ def test_fmt_words():
6969
assert fmt_words("Hello", " ", "World", sep="---") == "Hello--- ---World"
7070
assert fmt_words("Hello", "World", sep=" | ") == "Hello | World"
7171
assert fmt_words(" Hello ", " ", " World ") == " Hello World "
72+
73+
74+
def test_sanitize_title():
75+
assert sanitize_title("Hello, World!") == "Hello, World!"
76+
assert sanitize_title("Hej, Världen!") == "Hej, Världen!"
77+
assert sanitize_title("你好 世界") == "你好 世界"
78+
assert sanitize_title("こんにちは、世界") == "こんにちは 世界"
79+
assert sanitize_title(" *Hello,* \n\tWorld! --123@:': ") == "Hello, World! --123@:':"
80+
assert sanitize_title("<script foo='blah'><p>") == "script foo 'blah' p"

0 commit comments

Comments
 (0)