Skip to content

Commit a0a5df2

Browse files
Iteration 296: +hashPandasObject — FNV-1a 64-bit hashing
Run: https://github.com/githubnext/tsessebe/actions/runs/25139337654 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 2a497b1 commit a0a5df2

6 files changed

Lines changed: 525 additions & 0 deletions

File tree

‎playground/hash_pandas_object.html‎

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
6+
<title>tsb · hashPandasObject</title>
7+
<style>
8+
body { font-family: system-ui, sans-serif; max-width: 860px; margin: 2rem auto; padding: 0 1rem; }
9+
h1 { color: #1a1a2e; }
10+
h2 { color: #16213e; border-bottom: 1px solid #ddd; padding-bottom: 0.4rem; }
11+
pre { background: #f4f4f4; padding: 1rem; border-radius: 6px; overflow-x: auto; }
12+
code { font-family: "Fira Code", monospace; font-size: 0.9rem; }
13+
.output { background: #e8f5e9; border-left: 4px solid #4caf50; padding: 0.8rem 1rem; border-radius: 4px; }
14+
.note { background: #fff3cd; border-left: 4px solid #ffc107; padding: 0.8rem 1rem; border-radius: 4px; margin-top: 1rem; }
15+
a { color: #0066cc; }
16+
</style>
17+
</head>
18+
<body>
19+
<h1>📦 <code>hashPandasObject</code></h1>
20+
<p>
21+
Compute FNV-1a 64-bit hash values for each element of a
22+
<code>Series</code> or each row of a <code>DataFrame</code>.
23+
Mirrors <a href="https://pandas.pydata.org/docs/reference/api/pandas.util.hash_pandas_object.html"
24+
><code>pandas.util.hash_pandas_object</code></a>.
25+
</p>
26+
27+
<h2>Series hashing</h2>
28+
<pre><code>import { Series, hashPandasObject } from "tsb";
29+
30+
const s = new Series({ data: ["apple", "banana", "apple"], index: [0, 1, 2] });
31+
const h = hashPandasObject(s, { index: false });
32+
33+
// Same value → same hash
34+
console.log(h.iat(0) === h.iat(2)); // true (both "apple")
35+
console.log(h.iat(0) === h.iat(1)); // false ("apple" ≠ "banana")
36+
</code></pre>
37+
38+
<h2>DataFrame row hashing</h2>
39+
<pre><code>import { DataFrame, hashPandasObject } from "tsb";
40+
41+
const df = new DataFrame({
42+
id: [1, 2, 3],
43+
name: ["Alice", "Bob", "Alice"],
44+
age: [30, 25, 30],
45+
});
46+
47+
const rowHashes = hashPandasObject(df, { index: false });
48+
// Rows 0 and 2 are identical → same hash
49+
console.log(rowHashes.iat(0) === rowHashes.iat(2)); // true
50+
console.log(rowHashes.iat(0) === rowHashes.iat(1)); // false
51+
</code></pre>
52+
53+
<h2>Deduplication with hashes</h2>
54+
<pre><code>import { DataFrame, hashPandasObject } from "tsb";
55+
56+
const df = new DataFrame({
57+
a: [1, 2, 1, 3],
58+
b: ["x", "y", "x", "z"],
59+
});
60+
61+
const hashes = hashPandasObject(df, { index: false });
62+
const seen = new Set&lt;number&gt;();
63+
const uniqueRows: number[] = [];
64+
65+
for (let i = 0; i &lt; df.shape[0]; i++) {
66+
const h = hashes.iat(i);
67+
if (!seen.has(h)) {
68+
seen.add(h);
69+
uniqueRows.push(i);
70+
}
71+
}
72+
// uniqueRows = [0, 1, 3] — row 2 is a duplicate of row 0
73+
console.log(uniqueRows);
74+
</code></pre>
75+
76+
<h2>Controlling index inclusion</h2>
77+
<pre><code>import { Series, hashPandasObject } from "tsb";
78+
79+
const s = new Series({ data: [42, 42], index: ["a", "b"] });
80+
81+
// index=true (default): different index → different hash
82+
const withIdx = hashPandasObject(s, { index: true });
83+
console.log(withIdx.iat(0) === withIdx.iat(1)); // false
84+
85+
// index=false: only values matter
86+
const noIdx = hashPandasObject(s, { index: false });
87+
console.log(noIdx.iat(0) === noIdx.iat(1)); // true
88+
</code></pre>
89+
90+
<div class="note">
91+
<strong>Algorithm:</strong> FNV-1a 64-bit (Fowler–Noll–Vo), a fast non-cryptographic hash
92+
chosen for its simplicity and good dispersion on short inputs. Results are stored as
93+
<code>float64</code> numbers: each 64-bit hash is converted with <code>Number(BigInt)</code>, which rounds to the nearest double, so only about 53 bits of the hash are preserved.
94+
</div>
95+
96+
<p><a href="index.html">← Back to tsb playground</a></p>
97+
</body>
98+
</html>

‎playground/index.html‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,11 @@ <h3><a href="style.html" style="color: var(--accent); text-decoration: none;">
454454
<p>dataFrameStyle(df) · highlightMax / highlightMin / highlightNull / highlightBetween · backgroundGradient / textGradient · barChart · format / formatIndex · apply / applymap / map · setCaption / setTableStyles / hide · toHtml / toLatex. Mirrors pandas.DataFrame.style (Styler).</p>
455455
<div class="status done">✅ Complete</div>
456456
</div>
457+
<div class="feature-card">
458+
<h3><a href="hash_pandas_object.html" style="color: var(--accent); text-decoration: none;">🔑 hashPandasObject — FNV-1a Hashing</a></h3>
459+
<p>hashPandasObject(s) · hashPandasObject(df) · index option. Mirrors pandas.util.hash_pandas_object. FNV-1a 64-bit per element or row.</p>
460+
<div class="status done">✅ Complete</div>
461+
</div>
457462
</section>
458463
<div class="features-grid">
459464
<div class="feature-card">

‎src/index.ts‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,3 +683,5 @@ export type {
683683
GradientOptions,
684684
BarOptions,
685685
} from "./stats/index.ts";
686+
export { hashPandasObject } from "./stats/index.ts";
687+
export type { HashPandasObjectOptions } from "./stats/index.ts";

‎src/stats/hash_pandas_object.ts‎

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/**
2+
* hash_pandas_object — FNV-1a 64-bit hashes for Series and DataFrame.
3+
*
4+
* Mirrors `pandas.util.hash_pandas_object`, which returns a `Series` of
5+
* `uint64` hash values — one per element (for a Series input) or one per row
6+
* (for a DataFrame input).
7+
*
8+
* Implementation uses FNV-1a 64-bit (Fowler–Noll–Vo) running on JavaScript
9+
* `BigInt` arithmetic. The result values are stored as `float64` (the only
10+
* numeric type available in the tsb dtype system) by converting the `uint64`
11+
* value to `number` via `Number(bigint)`. Equal hashes always stay equal, but the
12+
* conversion rounds to the nearest `float64` (about 53 significant bits), so two
13+
* distinct `uint64` hashes can occasionally collapse to the same `number`.
14+
*
15+
* @example
16+
* ```ts
17+
* import { Series, DataFrame, hashPandasObject } from "tsb";
18+
*
19+
* const s = new Series({ data: [1, 2, 3], index: ["a", "b", "c"] });
20+
* const h = hashPandasObject(s);
21+
* // h is a Series<number> with hash values; equal inputs ⇒ equal hashes
22+
*
23+
* const df = new DataFrame({ a: [1, 2], b: ["x", "y"] });
24+
* const hr = hashPandasObject(df);
25+
* // hr has one hash per row
26+
* ```
27+
*
28+
* @module
29+
*/
30+
31+
import type { Scalar } from "../types.ts";
32+
import { DataFrame } from "../core/frame.ts";
33+
import { Series } from "../core/series.ts";
34+
35+
// ─── FNV-1a 64-bit constants ──────────────────────────────────────────────────
36+
37+
/** FNV 64-bit prime: the multiplier applied after each byte is mixed in. */
const FNV_PRIME = BigInt("0x00000100000001B3");
/** FNV-1a 64-bit offset basis: the starting state of every hash. */
const FNV_OFFSET = BigInt("0xcbf29ce484222325");
/** Mask that wraps a `bigint` product back into the unsigned 64-bit range. */
const MASK64 = (BigInt(1) << BigInt(64)) - BigInt(1);

/**
 * Fold a single byte into the running FNV-1a state.
 *
 * @param hash - Current 64-bit state.
 * @param byte - Byte value (0–255) to mix in.
 * @returns The next state: XOR the byte in, multiply by the FNV prime, wrap to 64 bits.
 */
function fnvByte(hash: bigint, byte: number): bigint {
  return ((hash ^ BigInt(byte)) * FNV_PRIME) & MASK64;
}

/**
 * Fold the UTF-8 encoding of `s` into the running FNV-1a state.
 *
 * Surrogate pairs are combined into a single code point and emitted as one
 * 4-byte UTF-8 sequence. A lone (unpaired) surrogate is emitted as its own
 * 3-byte sequence rather than being replaced, so distinct malformed strings
 * still feed distinct bytes to the hash.
 *
 * @param hash - Current 64-bit state.
 * @param s - String to hash; the empty string leaves `hash` unchanged.
 * @returns The state after every encoded byte has been mixed in.
 */
function fnvString(hash: bigint, s: string): bigint {
  for (let i = 0; i < s.length; i++) {
    let cp = s.charCodeAt(i);

    // A high surrogate followed by a low surrogate encodes one astral code point.
    if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < s.length) {
      const low = s.charCodeAt(i + 1);
      if (low >= 0xdc00 && low <= 0xdfff) {
        cp = 0x10000 + ((cp - 0xd800) << 10) + (low - 0xdc00);
        i++; // the low surrogate has been consumed as part of this code point
      }
    }

    if (cp < 0x80) {
      hash = fnvByte(hash, cp);
    } else if (cp < 0x800) {
      hash = fnvByte(hash, 0xc0 | (cp >> 6));
      hash = fnvByte(hash, 0x80 | (cp & 0x3f));
    } else if (cp < 0x10000) {
      hash = fnvByte(hash, 0xe0 | (cp >> 12));
      hash = fnvByte(hash, 0x80 | ((cp >> 6) & 0x3f));
      hash = fnvByte(hash, 0x80 | (cp & 0x3f));
    } else {
      hash = fnvByte(hash, 0xf0 | (cp >> 18));
      hash = fnvByte(hash, 0x80 | ((cp >> 12) & 0x3f));
      hash = fnvByte(hash, 0x80 | ((cp >> 6) & 0x3f));
      hash = fnvByte(hash, 0x80 | (cp & 0x3f));
    }
  }
  return hash;
}
64+
65+
/**
 * Fold one scalar value into the running FNV-1a state.
 *
 * Each non-missing value is prefixed with a one-byte type tag so that values
 * of different types never feed the same byte sequence to the hash. Without
 * the tag, `true` and `"\u0001"`, `5n` and `"5"`, or a `Date` and the string
 * of its epoch milliseconds would be indistinguishable, and an empty string
 * would contribute no bytes at all.
 *
 * Encodings:
 * - `null` / `undefined` → `0xfe 0xfe`
 * - `NaN`                → `0xfd 0xfd`
 * - boolean              → `0x01`, then `1` or `0`
 * - number               → `0x02`, then 8 little-endian IEEE 754 bytes
 * - bigint               → `0x03`, then the decimal string as UTF-8
 * - Date                 → `0x04`, then `getTime()` as a decimal string
 * - anything else        → `0x05`, then `String(val)` as UTF-8
 *
 * @param hash - Current 64-bit state.
 * @param val - Value to hash.
 * @returns The state after the value's encoding has been mixed in.
 */
function fnvScalar(hash: bigint, val: Scalar): bigint {
  if (val === null || val === undefined) {
    // Missing-value sentinel; its leading 0xfe is distinct from every type tag.
    return fnvByte(fnvByte(hash, 0xfe), 0xfe);
  }
  if (typeof val === "boolean") {
    return fnvByte(fnvByte(hash, 0x01), val ? 1 : 0);
  }
  if (typeof val === "number") {
    if (Number.isNaN(val)) {
      // One sentinel for NaN, regardless of its payload bits.
      return fnvByte(fnvByte(hash, 0xfd), 0xfd);
    }
    const view = new DataView(new ArrayBuffer(8));
    view.setFloat64(0, val, true); // little-endian
    let h = fnvByte(hash, 0x02);
    for (let i = 0; i < 8; i++) {
      h = fnvByte(h, view.getUint8(i));
    }
    return h;
  }
  if (typeof val === "bigint") {
    return fnvString(fnvByte(hash, 0x03), val.toString());
  }
  if (val instanceof Date) {
    return fnvString(fnvByte(hash, 0x04), String(val.getTime()));
  }
  // Strings, plus any remaining Scalar member, hashed through String(val).
  // NOTE(review): for object-typed scalars (e.g. timedelta-like) this relies on
  // a meaningful toString(); otherwise they all hash as "[object Object]" — confirm.
  return fnvString(fnvByte(hash, 0x05), String(val));
}
96+
97+
// ─── Options ──────────────────────────────────────────────────────────────────
98+
99+
/**
 * Settings accepted by {@link hashPandasObject}.
 */
export interface HashPandasObjectOptions {
  /**
   * Include each index label in the hash of its element or row.
   * Treated as `true` unless explicitly set to `false`.
   *
   * With `false`, only the values are hashed, so inputs that hold the same
   * values under different index labels yield the same hash values.
   */
  index?: boolean;
}
109+
110+
// ─── Series overload ──────────────────────────────────────────────────────────
111+
112+
/**
 * Hash every element of a `Series` with FNV-1a 64-bit.
 *
 * The counterpart of `pandas.util.hash_pandas_object` for a `Series` input.
 * The returned Series carries the same index as `obj`.
 *
 * @param obj - The `Series` whose elements are hashed.
 * @param options - See {@link HashPandasObjectOptions}.
 * @returns One hash value per element, as a `Series<number>`.
 *
 * @example
 * ```ts
 * const s = new Series({ data: ["a", "b", "a"], index: [0, 1, 2] });
 * const h = hashPandasObject(s, { index: false });
 * h.iat(0) === h.iat(2); // true — equal values hash equally
 * h.iat(0) !== h.iat(1); // true (with overwhelming probability)
 * ```
 */
export function hashPandasObject(
  obj: Series,
  options?: HashPandasObjectOptions,
): Series<number>;

/**
 * Hash every row of a `DataFrame` with FNV-1a 64-bit.
 *
 * The counterpart of `pandas.util.hash_pandas_object` for a `DataFrame`
 * input. The returned Series carries the same index as `obj`.
 *
 * @param obj - The `DataFrame` whose rows are hashed.
 * @param options - See {@link HashPandasObjectOptions}.
 * @returns One hash value per row, as a `Series<number>`.
 *
 * @example
 * ```ts
 * const df = new DataFrame({ a: [1, 2], b: ["x", "y"] });
 * const h = hashPandasObject(df);
 * // h.iat(0) hashes row 0, h.iat(1) hashes row 1
 * ```
 */
export function hashPandasObject(
  obj: DataFrame,
  options?: HashPandasObjectOptions,
): Series<number>;

export function hashPandasObject(
  obj: Series | DataFrame,
  options: HashPandasObjectOptions = {},
): Series<number> {
  // Only an explicit `false` switches index hashing off.
  const { index } = options;
  const withIndex = index !== false;

  return obj instanceof Series
    ? _hashSeries(obj, withIndex)
    : _hashDataFrame(obj, withIndex);
}
168+
169+
// ─── internal helpers ─────────────────────────────────────────────────────────
170+
171+
/**
 * Build the per-element hash Series for `s`.
 *
 * Each element starts from the FNV offset basis. When `includeIndex` is set,
 * the index label is hashed first and fenced off from the value by a `0xff`
 * byte; the element value is hashed last. The 64-bit result is stored via
 * `Number(...)` in a `float64` Series that reuses `s.index`.
 */
function _hashSeries(s: Series, includeIndex: boolean): Series<number> {
  const count = s.index.size;

  const data = Array.from({ length: count }, (_unused, pos) => {
    const seed = includeIndex
      ? fnvByte(fnvScalar(FNV_OFFSET, s.index.at(pos) as Scalar), 0xff)
      : FNV_OFFSET;
    return Number(fnvScalar(seed, s.iat(pos)));
  });

  return new Series<number>({ data, index: s.index, dtype: "float64" });
}
188+
189+
/**
 * Build the per-row hash Series for `df`.
 *
 * Each row starts from the FNV offset basis. When `includeIndex` is set, the
 * row's index label is hashed first, followed by a `0xff` separator. The
 * row's cells are then hashed in column order, each followed by a `0xfe`
 * separator. Column names themselves are not part of the hash. The 64-bit
 * result is stored via `Number(...)` in a `float64` Series that reuses
 * `df.index`.
 *
 * @param df - Frame whose rows are hashed.
 * @param includeIndex - Whether each row's index label contributes to its hash.
 * @returns One hash value per row.
 */
function _hashDataFrame(df: DataFrame, includeIndex: boolean): Series<number> {
  const [nRows] = df.shape;
  const colNames = df.columns.values as readonly string[];
  // Look each column up once rather than once per cell (rows × columns).
  // NOTE(review): assumes df.col(name) is a side-effect-free lookup — confirm.
  const columns = colNames.map((name) => df.col(name));
  const hashes: number[] = [];

  for (let i = 0; i < nRows; i++) {
    let h = FNV_OFFSET;
    if (includeIndex) {
      h = fnvScalar(h, df.index.at(i) as Scalar);
      h = fnvByte(h, 0xff); // separator between index label and cells
    }
    for (const column of columns) {
      h = fnvScalar(h, column.iat(i));
      h = fnvByte(h, 0xfe); // column separator
    }
    hashes.push(Number(h));
  }

  return new Series<number>({ data: hashes, index: df.index, dtype: "float64" });
}

‎src/stats/index.ts‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,3 +501,5 @@ export type {
501501
GradientOptions,
502502
BarOptions,
503503
} from "./style.ts";
504+
export { hashPandasObject } from "./hash_pandas_object.ts";
505+
export type { HashPandasObjectOptions } from "./hash_pandas_object.ts";

0 commit comments

Comments
 (0)