-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhash_table.py
104 lines (82 loc) · 3.75 KB
/
hash_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
'''
Source: Natural Language Processing with Classification and Vector Spaces, Week 4, Coursera Assignment
Description: Create Hash Table
'''
import numpy as np
# UNQ_C17 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def hash_value_of_vector(v, planes):
"""Create a hash for a vector; hash_id says which random hash to use.
Input:
- v: vector of tweet. It's dimension is (1, N_DIMS)
- planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that divide up the region
Output:
- res: a number which is used as a hash for your vector
"""
# for the set of planes,
# calculate the dot product between the vector and the matrix containing the planes
# remember that planes has shape (300, 10)
# The dot product will have the shape (1,10)
dot_product = np.dot(v, planes)
# get the sign of the dot product (1,10) shaped vector
sign_of_dot_product = np.sign(dot_product)
# set h to be false (eqivalent to 0 when used in operations) if the sign is negative,
# and true (equivalent to 1) if the sign is positive (1,10) shaped vector
# if the sign is 0, i.e. the vector is in the plane, consider the sign to be positive
h = sign_of_dot_product>=0
# remove extra un-used dimensions (convert this from a 2D to a 1D array)
h = np.squeeze(h)
# initialize the hash value to 0
hash_value = 0
n_planes = planes.shape[1]
for i in range(n_planes):
# increment the hash value by 2^i * h_i
hash_value += np.power(2,i)*h[i]
# cast hash_value as an integer
hash_value = int(hash_value)
return hash_value
def make_hash_table(vecs, planes, hash_value_of_vector=hash_value_of_vector):
"""
Input:
- vecs: list of vectors to be hashed.
- planes: the matrix of planes in a single "universe", with shape (embedding dimensions, number of planes).
Output:
- hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
- id_table: dictionary - keys are hashes, values are list of vectors id's
(it's used to know which tweet corresponds to the hashed vector)
"""
# number of planes is the number of columns in the planes matrix
num_of_planes = planes.shape[1]
# number of buckets is 2^(number of planes)
# ALTERNATIVE SOLUTION COMMENT:
# num_buckets = pow(2, num_of_planes)
num_buckets = 2**num_of_planes
# create the hash table as a dictionary.
# Keys are integers (0,1,2.. number of buckets)
# Values are empty lists
hash_table = {i: [] for i in range(num_buckets)}
# create the id table as a dictionary.
# Keys are integers (0,1,2... number of buckets)
# Values are empty lists
id_table = {i: [] for i in range(num_buckets)}
# for each vector in 'vecs'
for i, v in enumerate(vecs):
# calculate the hash value for the vector
h = hash_value_of_vector(v, planes)
# store the vector into hash_table at key h,
# by appending the vector v to the list at key h
hash_table[h].append(v) # @REPLACE None
# store the vector's index 'i' (each document is given a unique integer 0,1,2...)
# the key is the h, and the 'i' is appended to the list at key h
id_table[h].append(i) # @REPLACE None
return hash_table, id_table
# Creating the hashtables
def create_hash_id_tables(n_universes, planes_l, document_vecs):
hash_tables = []
id_tables = []
for universe_id in range(n_universes): # there are 25 hashes
print('working on hash universe #:', universe_id)
planes = planes_l[universe_id]
hash_table, id_table = make_hash_table(document_vecs, planes)
hash_tables.append(hash_table)
id_tables.append(id_table)
return hash_tables, id_tables