|
| 1 | +/** |
| 2 | + * hash_pandas_object — FNV-1a 64-bit hashes for Series and DataFrame. |
| 3 | + * |
| 4 | + * Mirrors `pandas.util.hash_pandas_object`, which returns a `Series` of |
| 5 | + * `uint64` hash values — one per element (for a Series input) or one per row |
| 6 | + * (for a DataFrame input). |
| 7 | + * |
| 8 | + * Implementation uses FNV-1a 64-bit (Fowler–Noll–Vo) running on JavaScript |
| 9 | + * `BigInt` arithmetic. The result values are stored as `float64` (the only |
| 10 | + * numeric type available in the tsb dtype system) by converting the `uint64` |
| 11 | + * value to `number` via `Number(bigint)`. This conversion is lossy: a `float64` |
| 12 | + * has a 53-bit significand, so distinct `uint64` hashes above 2**53 can round to |
| 13 | + * the same `number`. Equal inputs still always produce equal hashes. |
| 14 | + * |
| 15 | + * @example |
| 16 | + * ```ts |
| 17 | + * import { Series, DataFrame, hashPandasObject } from "tsb"; |
| 18 | + * |
| 19 | + * const s = new Series({ data: [1, 2, 3], index: ["a", "b", "c"] }); |
| 20 | + * const h = hashPandasObject(s); |
| 21 | + * // h is a Series<number> with hash values; equal inputs ⇒ equal hashes |
| 22 | + * |
| 23 | + * const df = new DataFrame({ a: [1, 2], b: ["x", "y"] }); |
| 24 | + * const hr = hashPandasObject(df); |
| 25 | + * // hr has one hash per row |
| 26 | + * ``` |
| 27 | + * |
| 28 | + * @module |
| 29 | + */ |
| 30 | + |
| 31 | +import type { Scalar } from "../types.ts"; |
| 32 | +import { DataFrame } from "../core/frame.ts"; |
| 33 | +import { Series } from "../core/series.ts"; |
| 34 | + |
| 35 | +// ─── FNV-1a 64-bit constants ────────────────────────────────────────────────── |
| 36 | + |
/** 64-bit FNV prime (2^40 + 2^8 + 0xb3). */
const FNV_PRIME = BigInt("0x00000100000001B3");
/** 64-bit FNV offset basis: the state every hash starts from. */
const FNV_OFFSET = BigInt("0xcbf29ce484222325");
/** All 64 low bits set; truncates the unbounded `bigint` product to 64 bits. */
const MASK64 = (BigInt(1) << BigInt(64)) - BigInt(1);

/**
 * Fold a single byte into the running FNV-1a state.
 *
 * @param hash - Current 64-bit state.
 * @param byte - Byte value to mix in.
 * @returns The new state: XOR with the byte, multiply by the prime, wrap to 64 bits.
 */
function fnvByte(hash: bigint, byte: number): bigint {
  return ((hash ^ BigInt(byte)) * FNV_PRIME) & MASK64;
}

/**
 * Fold the UTF-8 encoding of `s` into the running FNV-1a state.
 *
 * A valid UTF-16 surrogate pair is combined into one code point and emitted
 * as a single 4-byte UTF-8 sequence. An unpaired surrogate has no UTF-8 form;
 * it is emitted with the generic 3-byte layout so that distinct strings still
 * feed distinct byte streams.
 *
 * @param hash - Current 64-bit state.
 * @param s - String to hash.
 * @returns The state after all encoded bytes have been folded in.
 */
function fnvString(hash: bigint, s: string): bigint {
  let state = hash;
  for (let i = 0; i < s.length; i++) {
    let code = s.charCodeAt(i);

    // Combine a high surrogate with the low surrogate that follows it.
    if (code >= 0xd800 && code <= 0xdbff && i + 1 < s.length) {
      const low = s.charCodeAt(i + 1);
      if (low >= 0xdc00 && low <= 0xdfff) {
        code = 0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00);
        i++; // the low surrogate has been consumed
      }
    }

    if (code < 0x80) {
      state = fnvByte(state, code);
    } else if (code < 0x800) {
      state = fnvByte(state, 0xc0 | (code >> 6));
      state = fnvByte(state, 0x80 | (code & 0x3f));
    } else if (code < 0x10000) {
      state = fnvByte(state, 0xe0 | (code >> 12));
      state = fnvByte(state, 0x80 | ((code >> 6) & 0x3f));
      state = fnvByte(state, 0x80 | (code & 0x3f));
    } else {
      state = fnvByte(state, 0xf0 | (code >> 18));
      state = fnvByte(state, 0x80 | ((code >> 12) & 0x3f));
      state = fnvByte(state, 0x80 | ((code >> 6) & 0x3f));
      state = fnvByte(state, 0x80 | (code & 0x3f));
    }
  }
  return state;
}
| 64 | + |
/**
 * Fold one scalar cell into the FNV state and return the new state.
 *
 * Encoding by kind of value:
 * - `null` / `undefined`: the two-byte sentinel `0xfe 0xfe`
 * - `boolean`: one byte, `1` for true and `0` for false
 * - `NaN`: the two-byte sentinel `0xfd 0xfd`
 * - any other `number`: its 8 IEEE 754 bytes, little-endian
 * - `bigint`: its decimal string
 * - `Date`: the string form of `getTime()`
 * - everything else: `String(val)`
 *
 * @param hash - Current 64-bit state.
 * @param val - Value to mix in.
 */
function fnvScalar(hash: bigint, val: Scalar): bigint {
  if (val === null || val === undefined) {
    return fnvByte(fnvByte(hash, 0xfe), 0xfe);
  }

  switch (typeof val) {
    case "boolean":
      return fnvByte(hash, val ? 1 : 0);

    case "number": {
      if (Number.isNaN(val)) {
        return fnvByte(fnvByte(hash, 0xfd), 0xfd);
      }
      // Write the double little-endian, then read it back byte by byte.
      const view = new DataView(new ArrayBuffer(8));
      view.setFloat64(0, val, true);
      let state = hash;
      for (let offset = 0; offset < 8; offset++) {
        state = fnvByte(state, view.getUint8(offset));
      }
      return state;
    }

    case "bigint":
      return fnvString(hash, val.toString());

    default: {
      // Dates hash by epoch milliseconds; strings and anything else by String().
      const text = val instanceof Date ? String(val.getTime()) : String(val);
      return fnvString(hash, text);
    }
  }
}
| 96 | + |
| 97 | +// ─── Options ────────────────────────────────────────────────────────────────── |
| 98 | + |
/** Settings accepted by {@link hashPandasObject}. */
export interface HashPandasObjectOptions {
  /**
   * Fold each index label into the hash of its element or row.
   * Treated as `true` when omitted.
   *
   * Pass `false` to hash values only: inputs with identical values but
   * different index labels then produce identical hash values.
   */
  index?: boolean;
}
| 109 | + |
| 110 | +// ─── Series overload ────────────────────────────────────────────────────────── |
| 111 | + |
/**
 * Hash every element of a `Series` with FNV-1a 64-bit.
 *
 * Counterpart of `pandas.util.hash_pandas_object` for a `Series` argument.
 * The returned Series reuses the index of `obj`.
 *
 * @param obj - The `Series` whose elements are hashed.
 * @param options - See {@link HashPandasObjectOptions}.
 * @returns A `Series<number>` holding one hash per element.
 *
 * @example
 * ```ts
 * const s = new Series({ data: ["a", "b", "a"], index: [0, 1, 2] });
 * const h = hashPandasObject(s, { index: false });
 * h.iat(0) === h.iat(2); // true — equal values hash equally once the index is excluded
 * ```
 */
export function hashPandasObject(
  obj: Series,
  options?: HashPandasObjectOptions,
): Series<number>;

/**
 * Hash every row of a `DataFrame` with FNV-1a 64-bit.
 *
 * Counterpart of `pandas.util.hash_pandas_object` for a `DataFrame` argument.
 * The returned Series reuses the index of `obj`.
 *
 * @param obj - The `DataFrame` whose rows are hashed.
 * @param options - See {@link HashPandasObjectOptions}.
 * @returns A `Series<number>` holding one hash per row.
 *
 * @example
 * ```ts
 * const df = new DataFrame({ a: [1, 2], b: ["x", "y"] });
 * const h = hashPandasObject(df);
 * // h.iat(0) hashes row 0, h.iat(1) hashes row 1
 * ```
 */
export function hashPandasObject(
  obj: DataFrame,
  options?: HashPandasObjectOptions,
): Series<number>;

export function hashPandasObject(
  obj: Series | DataFrame,
  options: HashPandasObjectOptions = {},
): Series<number> {
  // Only an explicit `false` disables index hashing; an omitted option means "include".
  const includeIndex = !(options.index === false);

  return obj instanceof Series
    ? _hashSeries(obj, includeIndex)
    : _hashDataFrame(obj, includeIndex);
}
| 168 | + |
| 169 | +// ─── internal helpers ───────────────────────────────────────────────────────── |
| 170 | + |
/**
 * Compute one hash per element of `s`.
 *
 * With `includeIndex`, each hash starts from the element's index label
 * followed by a `0xff` separator byte; the element value is folded in last.
 * The 64-bit result is converted with `Number(...)` before being stored.
 *
 * @param s - Series to hash.
 * @param includeIndex - Whether index labels contribute to the hash.
 * @returns A float64 Series of hashes that reuses the index of `s`.
 */
function _hashSeries(s: Series, includeIndex: boolean): Series<number> {
  const labels = s.index;

  const data = Array.from({ length: labels.size }, (_, pos) => {
    const seed = includeIndex
      ? fnvByte(fnvScalar(FNV_OFFSET, labels.at(pos) as Scalar), 0xff)
      : FNV_OFFSET;
    return Number(fnvScalar(seed, s.iat(pos)));
  });

  return new Series<number>({ data, index: labels, dtype: "float64" });
}
| 188 | + |
/**
 * Compute one hash per row of `df`.
 *
 * Each row hash folds, in order: the index label and a `0xff` separator
 * (only when `includeIndex` is set), then every column's cell followed by a
 * column separator byte. The 64-bit result is converted with `Number(...)`
 * before being stored.
 *
 * @param df - DataFrame to hash.
 * @param includeIndex - Whether index labels contribute to the hash.
 * @returns A float64 Series of row hashes that reuses the index of `df`.
 */
function _hashDataFrame(df: DataFrame, includeIndex: boolean): Series<number> {
  const indexSeparator = 0xff;
  // Must differ from the null sentinel (0xfe 0xfe) emitted by fnvScalar.
  // With 0xfe as separator, rows such as ("", null) and (null, "") fold the
  // same bytes and always collide. 0xfc is produced neither by a sentinel
  // nor by the UTF-8 string encoder.
  const columnSeparator = 0xfc;

  const [nRows] = df.shape;
  const colNames = df.columns.values as readonly string[];
  // Resolve each column once rather than once per cell.
  const columns = colNames.map((name) => df.col(name));
  const hashes: number[] = [];

  for (let i = 0; i < nRows; i++) {
    let h = FNV_OFFSET;
    if (includeIndex) {
      h = fnvScalar(h, df.index.at(i) as Scalar);
      h = fnvByte(h, indexSeparator);
    }
    for (const column of columns) {
      h = fnvScalar(h, column.iat(i));
      h = fnvByte(h, columnSeparator);
    }
    hashes.push(Number(h));
  }

  return new Series<number>({ data: hashes, index: df.index, dtype: "float64" });
}
0 commit comments