File size: 16,269 Bytes
dbb04e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3a3710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbb04e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3a3710
dbb04e4
 
 
 
c3a3710
 
dbb04e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3a3710
 
dbb04e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3a3710
dbb04e4
 
 
7c8b011
 
dbb04e4
 
 
 
 
 
c3a3710
 
 
 
 
 
 
 
 
 
dbb04e4
c3a3710
 
 
 
 
 
 
dbb04e4
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""

Binary Hyperdimensional Vector (Binary HDV) Core

=================================================

Phase 3.0 implementation of binary VSA operations.



Based on Kanerva's Hyperdimensional Computing theory (2009).

Uses standard mathematical operations (XOR, Hamming distance, majority bundling)

that are fundamental VSA primitives β€” not derived from any proprietary implementation.



Key design choices:

  - D = 16,384 bits (2^14) β€” configurable via config.yaml

  - Storage: packed as np.uint8 arrays (D/8 bytes = 2,048 bytes per vector)

  - Similarity: Hamming distance (popcount of XOR result)

  - Binding: element-wise XOR (self-inverse, commutative)

  - Bundling: element-wise majority vote (thresholded sum)

  - Sequence: circular bit-shift (permutation)



All batch operations are NumPy-vectorized (no Python loops for distance computation).

"""

import hashlib
from typing import List, Optional, Tuple

import numpy as np
import re


# Cached lookup table for popcount (bits set per byte value 0-255)
_POPCOUNT_TABLE: Optional[np.ndarray] = None


def _build_popcount_table() -> np.ndarray:
    """Build or return cached popcount lookup table for bytes (0-255)."""
    global _POPCOUNT_TABLE
    if _POPCOUNT_TABLE is None:
        _POPCOUNT_TABLE = np.array(
            [bin(i).count("1") for i in range(256)], dtype=np.int32
        )
    return _POPCOUNT_TABLE


class BinaryHDV:
    """
    A binary hyperdimensional vector stored as a packed uint8 array.

    The vector has `dimension` logical bits, stored in `dimension // 8` bytes.
    Each byte holds 8 bits in big-endian bit order (MSB first within each byte).

    Attributes:
        data: np.ndarray of dtype uint8, shape (dimension // 8,)
        dimension: int, number of logical bits
    """

    __slots__ = ("data", "dimension")

    def __init__(self, data: np.ndarray, dimension: int):
        """
        Args:
            data: Packed uint8 array of shape (dimension // 8,).
            dimension: Number of logical bits. Must be a multiple of 8;
                otherwise `dimension // 8` bytes would silently drop bits.
        """
        # Validate here (not just in random()) so zeros/ones/from_seed/
        # from_bytes also reject dimensions that cannot be packed exactly.
        assert dimension % 8 == 0, "Dimension must be multiple of 8"
        assert data.dtype == np.uint8, f"Expected uint8, got {data.dtype}"
        assert data.shape == (dimension // 8,), (
            f"Shape mismatch: expected ({dimension // 8},), got {data.shape}"
        )
        self.data = data
        self.dimension = dimension

    # ------------------------------------------------------------------
    # Factory methods
    # ------------------------------------------------------------------

    @classmethod
    def random(cls, dimension: int = 16384) -> "BinaryHDV":
        """Generate a random binary vector (uniform i.i.d. bits)."""
        n_bytes = dimension // 8
        data = np.random.randint(0, 256, size=n_bytes, dtype=np.uint8)
        return cls(data=data, dimension=dimension)

    @classmethod
    def zeros(cls, dimension: int = 16384) -> "BinaryHDV":
        """All-zero vector."""
        n_bytes = dimension // 8
        return cls(data=np.zeros(n_bytes, dtype=np.uint8), dimension=dimension)

    @classmethod
    def ones(cls, dimension: int = 16384) -> "BinaryHDV":
        """All-one vector (every bit set)."""
        n_bytes = dimension // 8
        return cls(
            data=np.full(n_bytes, 0xFF, dtype=np.uint8), dimension=dimension
        )

    @classmethod
    def from_seed(cls, seed: str, dimension: int = 16384) -> "BinaryHDV":
        """
        Deterministic vector from a string seed.

        Uses SHA-3 (SHAKE-256), which can emit a digest of arbitrary length
        in a single pass, so one call fills all `dimension // 8` bytes.
        """
        n_bytes = dimension // 8
        digest = hashlib.shake_256(seed.encode()).digest(n_bytes)
        # Copy so the vector owns writable memory (frombuffer is read-only).
        data = np.frombuffer(digest, dtype=np.uint8).copy()
        return cls(data=data, dimension=dimension)

    # ------------------------------------------------------------------
    # Core VSA operations
    # ------------------------------------------------------------------

    def xor_bind(self, other: "BinaryHDV") -> "BinaryHDV":
        """
        Binding via element-wise XOR.

        Properties:
          - Self-inverse: a ⊕ a = 0
          - Commutative: a ⊕ b = b ⊕ a
          - Associative: (a ⊕ b) ⊕ c = a ⊕ (b ⊕ c)
          - Preserves distance: hamming(a⊕c, b⊕c) = hamming(a, b)
        """
        assert self.dimension == other.dimension
        return BinaryHDV(
            data=np.bitwise_xor(self.data, other.data),
            dimension=self.dimension,
        )

    def permute(self, shift: int = 1) -> "BinaryHDV":
        """
        Circular bit-shift for sequence/role encoding.

        Shifts all bits by `shift` positions (with wrap-around).
        Implemented by unpacking to individual bits, rotating with
        np.roll, and repacking.
        """
        if shift == 0:
            return BinaryHDV(data=self.data.copy(), dimension=self.dimension)

        # Normalize shift to [0, dimension) so negative shifts wrap correctly
        shift = shift % self.dimension

        bits = np.unpackbits(self.data)
        bits = np.roll(bits, shift)
        return BinaryHDV(
            data=np.packbits(bits), dimension=self.dimension
        )

    def invert(self) -> "BinaryHDV":
        """Bitwise NOT — produces the maximally distant vector."""
        return BinaryHDV(
            data=np.bitwise_not(self.data), dimension=self.dimension
        )

    def hamming_distance(self, other: "BinaryHDV") -> int:
        """
        Hamming distance: count of differing bits.

        Uses the byte-popcount lookup table for speed (no unpackbits).
        Range: [0, dimension].
        """
        assert self.dimension == other.dimension
        xor_result = np.bitwise_xor(self.data, other.data)
        return int(_build_popcount_table()[xor_result].sum())

    def normalized_distance(self, other: "BinaryHDV") -> float:
        """Hamming distance normalized to [0.0, 1.0]."""
        return self.hamming_distance(other) / self.dimension

    def similarity(self, other: "BinaryHDV") -> float:
        """
        Similarity score in [0.0, 1.0].

        1.0 = identical, 0.0 = maximally different.
        0.5 = random/orthogonal (expected for unrelated vectors).
        """
        return 1.0 - self.normalized_distance(other)

    # ------------------------------------------------------------------
    # Compatibility shims for legacy HDV API
    # ------------------------------------------------------------------

    def bind(self, other: "BinaryHDV") -> "BinaryHDV":
        """
        Alias for xor_bind(). Compatibility shim for legacy HDV API.

        Deprecated: Use xor_bind() directly for new code.
        """
        return self.xor_bind(other)

    def unbind(self, other: "BinaryHDV") -> "BinaryHDV":
        """
        Alias for xor_bind(). Since XOR is self-inverse, unbind = bind.

        Compatibility shim for legacy HDV API.
        """
        return self.xor_bind(other)

    def cosine_similarity(self, other: "BinaryHDV") -> float:
        """
        Alias for similarity(). Compatibility shim for legacy HDV API.

        Note: For binary vectors, this returns Hamming-based similarity,
        not true cosine similarity. The values are comparable for most use cases.
        """
        return self.similarity(other)

    def normalize(self) -> "BinaryHDV":
        """
        No-op for binary vectors. Compatibility shim for legacy HDV API.

        Binary vectors are already "normalized" in the sense that they
        consist only of 0s and 1s. Returns a copy of the vector.
        """
        return BinaryHDV(data=self.data.copy(), dimension=self.dimension)

    def __xor__(self, other: "BinaryHDV") -> "BinaryHDV":
        """Alias for xor_bind(). Enables v1 ^ v2 syntax."""
        return self.xor_bind(other)

    def to_bytes(self) -> bytes:
        """Serialize to raw bytes (for storage)."""
        return self.data.tobytes()

    @classmethod
    def from_bytes(cls, raw: bytes, dimension: int = 16384) -> "BinaryHDV":
        """Deserialize from raw bytes produced by to_bytes()."""
        data = np.frombuffer(raw, dtype=np.uint8).copy()
        return cls(data=data, dimension=dimension)

    def __repr__(self) -> str:
        # Popcount via lookup table — cheap enough for repr on large vectors
        popcount = int(_build_popcount_table()[self.data].sum())
        return f"BinaryHDV(dim={self.dimension}, popcount={popcount}/{self.dimension})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, BinaryHDV):
            return NotImplemented
        return self.dimension == other.dimension and np.array_equal(
            self.data, other.data
        )


# ======================================================================
# Batch operations (NumPy-vectorized, no Python loops)
# ======================================================================


def batch_hamming_distance(
    query: BinaryHDV, database: np.ndarray
) -> np.ndarray:
    """
    Hamming distance from a query vector to every row of a packed database.

    Args:
        query: Single BinaryHDV query vector.
        database: uint8 array of shape (N, D//8), one packed binary vector
                  per row.

    Returns:
        Integer array of shape (N,) holding per-row Hamming distances.
    """
    table = _build_popcount_table()

    # Broadcasting XORs the query's packed bytes against all N rows at once.
    differing_bytes = np.bitwise_xor(database, query.data)  # (N, D//8)

    # A table lookup turns each byte into its set-bit count; summing the
    # counts along each row yields the total bit-level distance. This is
    # the fastest pure-NumPy approach for packed binary vectors.
    return table[differing_bytes].sum(axis=1)


def batch_hamming_distance_matrix(
    database: np.ndarray,
) -> np.ndarray:
    """
    Full pairwise Hamming distance matrix for a packed-vector database.

    Args:
        database: uint8 array of shape (N, D//8).

    Returns:
        Symmetric int32 array of shape (N, N); the diagonal is zero.
    """
    n = database.shape[0]
    table = _build_popcount_table()
    out = np.zeros((n, n), dtype=np.int32)

    # Compute only the strict upper triangle; mirror it for symmetry.
    for row in range(n - 1):
        xored = np.bitwise_xor(database[row], database[row + 1:])
        dists = table[xored].sum(axis=1)
        out[row, row + 1:] = dists
        out[row + 1:, row] = dists

    return out


def majority_bundle(
    vectors: List[BinaryHDV], randomize_ties: bool = False
) -> BinaryHDV:
    """
    Bundle multiple vectors via element-wise majority vote (superposition).

    For each bit position, the result bit is 1 if strictly more than half of
    the input vectors have a 1 at that position. This is the standard VSA
    bundling operation.

    Args:
        vectors: Non-empty list of BinaryHDV vectors, all of equal dimension.
        randomize_ties: If True, break ties randomly (ties can occur only
                        for an even number of inputs). If False (default),
                        ties resolve to 0 for deterministic results.

    Returns:
        The bundled BinaryHDV.
    """
    assert len(vectors) > 0, "Cannot bundle empty list"
    dimension = vectors[0].dimension
    # Explicit check: mixed dimensions would otherwise surface as an
    # obscure shape error from np.stack below.
    assert all(v.dimension == dimension for v in vectors), (
        "All bundled vectors must share the same dimension"
    )

    # Stack packed data first, then unpack all rows in one call — avoids
    # K separate unpackbits calls and list-comprehension overhead.
    packed_data = np.stack([v.data for v in vectors], axis=0)  # (K, D//8)
    all_bits = np.unpackbits(packed_data, axis=1)  # (K, D)

    # Count of 1-bits per position
    sums = all_bits.sum(axis=0)  # (D,)

    # Majority vote: strictly more than half means 1
    threshold = len(vectors) / 2.0

    result_bits = np.zeros(dimension, dtype=np.uint8)
    result_bits[sums > threshold] = 1

    # Handle exact ties (sums == threshold) when requested
    if randomize_ties:
        ties = sums == threshold
        if ties.any():
            result_bits[ties] = np.random.randint(
                0, 2, size=ties.sum(), dtype=np.uint8
            )

    return BinaryHDV(data=np.packbits(result_bits), dimension=dimension)


def top_k_nearest(
    query: BinaryHDV, database: np.ndarray, k: int = 10
) -> List[Tuple[int, int]]:
    """
    Find the k nearest neighbors of `query` by Hamming distance.

    Args:
        query: Query vector.
        database: 2D array of shape (N, D//8) packed binary vectors.
        k: Number of nearest neighbors. Clamped to N; k <= 0 or an empty
           database yields an empty list.

    Returns:
        List of (index, distance) tuples, sorted by distance ascending.
    """
    distances = batch_hamming_distance(query, database)
    n = len(distances)
    k = min(k, n)
    if k <= 0:
        # Also covers an empty database, where argpartition would raise.
        return []

    if k < n:
        # argpartition is O(N) vs O(N log N) for a full sort. Its kth index
        # must be < n, so partition around k-1 (bug fix: the previous code
        # passed kth == n when k == n, raising ValueError).
        indices = np.argpartition(distances, k - 1)[:k]
    else:
        # Keeping everything — no partition needed.
        indices = np.arange(n)

    selected_distances = distances[indices]

    # Sort just the k survivors by distance
    sort_order = np.argsort(selected_distances)
    sorted_indices = indices[sort_order]
    sorted_distances = selected_distances[sort_order]

    return [(int(idx), int(dist)) for idx, dist in zip(sorted_indices, sorted_distances)]


# ======================================================================
# Text encoding pipeline
# ======================================================================


class TextEncoder:
    """
    Encode text to a binary HDV using deterministic per-token vectors
    combined with position-dependent permutation.

    Each token maps to a stable base vector seeded from its string, so the
    same word always produces the same vector. To encode a sequence, token
    i's bit pattern is circularly shifted by i positions, and all shifted
    patterns are merged with an element-wise majority vote.
    """

    def __init__(self, dimension: int = 16384):
        self.dimension = dimension
        self._token_cache: dict[str, BinaryHDV] = {}

    def get_token_vector(self, token: str) -> BinaryHDV:
        """Return the deterministic vector for `token`, caching on first use."""
        cached = self._token_cache.get(token)
        if cached is None:
            cached = BinaryHDV.from_seed(token, self.dimension)
            self._token_cache[token] = cached
        return cached

    def encode(self, text: str) -> BinaryHDV:
        """
        Encode a text string to a binary HDV.

        Tokenization: lowercased word-character runs extracted by regex.
        Token i is position-encoded by circularly rolling its unpacked bits
        i positions; the rolled patterns are bundled by majority vote
        (ties resolve to 0). Text with no tokens yields a fresh random
        vector; a single token returns its base vector directly.
        """
        tokens = re.findall(r'\b\w+\b', text.lower())
        if not tokens:
            return BinaryHDV.random(self.dimension)

        if len(tokens) == 1:
            return self.get_token_vector(tokens[0])

        # Build a (K, D) bit matrix: row i is token i's bits rolled i places.
        rows = []
        for position, token in enumerate(tokens):
            bits = np.unpackbits(self.get_token_vector(token).data)
            rows.append(np.roll(bits, position) if position else bits)

        # Majority vote per position: 2*count > K  <=>  count > K/2.
        ones_per_position = np.sum(rows, axis=0)
        majority = (ones_per_position * 2 > len(tokens)).astype(np.uint8)

        return BinaryHDV(data=np.packbits(majority), dimension=self.dimension)

    def encode_with_context(
        self, text: str, context_hdv: BinaryHDV
    ) -> BinaryHDV:
        """
        Encode text and bind it with a context vector.

        Result = encode(text) ⊕ context — associating content with context.
        """
        return self.encode(text).xor_bind(context_hdv)