*=======================================================================; * COOKING IN-MEMORY HASH TABLE LOOKUP FROM SCRATCH ; *-----------------------------------------------------------------------; * Specifically: Array hash table lookup with LINEAR PROBING. ; *-----------------------------------------------------------------------; * NOTE: Not hash object lookup or macro memory lookup. ; *-----------------------------------------------------------------------; * Based on _temporary_ arrays, as they are in memory but not in PDV. ; *=======================================================================; *==================================================; * Sample data from SASHELP.CLASS (19 observations) ; * Added variable ID: Integer, 3-digit ; *==================================================; data KD (keep = ID name age) ; if _n_ = 1 then call streaminit (1) ; retain ID 100 ; ID + rand ("integer", 13, 53) ; set sashelp.class ; run ; *==================================================; * LINEAR SEARCH (key: ID, data: AGE) ; *==================================================; data _null_ ; array K [0:19] _temporary_ ; array D [0:19] _temporary_ ; * load ; put " X K[x] D[x]" / 14 * "-" ; do x = 1 by 1 until (z) ; set kd end = z ; K[x] = ID ; D[x] = age ; put x 2. +1 (k[x] d[x]) (4.-R) ; end ; put ; * lookup ; do ID = 149, 502, 893, 000 ; call missing (name) ; K[0] = ID ; do x = hbound (k) by -1 until (k[x] = ID) ; end ; age = D[x] ; put ID=z3. x=z2. age= ; end ; stop ; run ; *======================================================; * LINEAR SEARCH: PROS AND CONS ; *======================================================; * Conceptually no-brainer ; * At N < 6, fastest key-comparison search ; *------------------------------------------------------; * Runs in O(N) time ; * N of key-comparisons proportional to N keys in table ; *======================================================; *======================================================; * BINARY SEARCH (key: ID, data: AGE) ; *======================================================; data _null_ ; array K [1:19] _temporary_ ; array D [1:19] _temporary_ ; put " X K[x] D[X]" / 14 * "-" ; * load ; do x = 1 by 1 until (z) ; set kd end = z ; K[x] = ID ; D[x] = age ; put x 2. +1 (k[x] d[x]) (4.-R) ; end ; put ; * lookup ; do ID = 149, 502, 893, 000 ; call missing (age, key_found) ; lo = lbound (k) ; hi = hbound (k) ; do until (lo > hi or key_found) ; x = floor ((lo + hi) / 2) ; if id < K[x] then hi = x - 1 ; else if id > K[x] then lo = x + 1 ; else key_found = 1 ; end ; if key_found then age = D[x] ; put ID=z3. x=z2. age= ; end ; stop ; run ; *================================================================; * BINARY SEARCH PROS: AND CONS ; *================================================================; * Runs in O(log2(N)) time ; * N of key comparisons is proportional to log2 (N keys in table) ; * For arbitrary key-values, fastest key-comparison search ; *================================================================; * Table must be sorted (already sorted in sample data) ; * More complex, finicky to code - easy to get an infinite loop ; *================================================================; *================================================================; * BOTH LINEAR AND BINARY are KEY-COMPARISON SEARCH ALGORITHMS ; *----------------------------------------------------------------; * For both, run-time grows with N ; *================================================================; * COULD WE DEVISE A SEARCH SCHEME SUCH THAT: ; *----------------------------------------------------------------; * 1. No comparisons between keys are needed ? ; * 2. Run time does not depend on number of keys in the table N ? ; *================================================================; *================================================================; * KEY-INDEXED SEARCH ; *================================================================; * ID key-values are 3-digit integers, i.e. range from 0 to 999. ; * [0:999] array has an index-value for every possible key-value. ; *----------------------------------------------------------------; * To load, for every key-value ID from KD, we can set D[X]=AGE ; * where X=ID. ; *----------------------------------------------------------------; * To search for an ID, we only need to inspect D[X] where X=ID. ; * If D[X] is null, ID is not in the table, else AGE=D[X]. ; *================================================================; data _null_ ; array D [0:999] _temporary_ ; * load ; do until (z) ; set kd end = z ; X = ID ; D[X] = age ; end ; * lookup ; do ID = 149, 502, 893, 000 ; X = ID ; AGE = D[x] ; put ID=z3. x=z3. age= ; end ; stop ; run ; *================================================================; * KEY-INDEXED SEARCH: Pros and Cons ; *================================================================; * Fastest existing search algorithm - theoretically unbeatable. ; * No comparisons between keys are necessary ; * Lookup speed does not depend on number of keys in the table N. ; * Actually practically applicable in many real-life scenarios. ; *================================================================; * Extreme speed is a trade-off for obscene memory utilization. ; * Table D is very sparse: There are 1000 slots for 19 keys. ; * Hence: Good for integer keys with limited range. ; * Example: SAS dates as keys. [-100000:100000] will cover it. ; * BUT NOT SAS datetimes! SSN requires 1E9-item array. Etc. ; *================================================================; *================================================================; * HASH TABLE SEARCH: THE GREAT COMPROMISE ; *================================================================; * Keep the central idea of key-indexing: Mapping keys to array ; * array indices based on digital properties of the keys. ; *----------------------------------------------------------------; * Make the table less sparse - e.g. approx 1.5-2 times N. ; *----------------------------------------------------------------; * For a set or arbitrary keys, there exists NO a priori rule to ; * guarantee one-to-one mapping of key-values to array indices. ; * More than 1 key-value will likely map to the same array index. ; *----------------------------------------------------------------; * To resolve such COLLISIONS, allow SOME comparisons between ; * keys to tell the keys mapping to the same index apart. ; *================================================================; *================================================================; * STEP BELOW IS JUST A KEY-MAPPING DEMO. ; *================================================================; * BUT it also shows uses a real COLLISION-RESOLUTION POLICY ; * called SEPARATE CHAINING. ; *================================================================; %let size = 29 ; * table size, approx 1.5*N (N=19). N / size = LOAD FACTOR ; data _null_ ; array K [&size, 0:5] _temporary_ ; do until (z) ; set kd end = z ; X = mod (ID, &size) + 1 ; * HASH FUNCTION: maps any integer between 1 and &size ; k[X, 0] + 1 ; chain_seq = k[X, 0] ; * Number of key-values mapped to this index X as yet ; k[X, chain_seq] = id ; end ; * show distribution of ID key-values among indices X ; do X = 1 to hbound (k, 1) ; put X=z2. k[X, 0]=1. @ ; do chain_seq = 1 to hbound (k, 2) ; put +1 k[X, chain_seq] z3. @ ; end ; put ; end ; * do ID = 149, 502, 893, 000 ; * ; * end ; run ; *================================================================; * You may want to try coding the lookup part absent from above ; * as a useful practical hash-coding etude. Note: you will need ; * to add another 2-dim array D[&size, 0:5] for the data AGE. ; *================================================================; *================================================================; * DEMO TAKEAWAYS ; *----------------------------------------------------------------; * For a hash table of a given load factor, run time is O(1). ; * Does not depend on the number of keys N in the hash table. ; *----------------------------------------------------------------; * We do not know what the 2nd dimension should be beforehand. ; * Thus, have to allocate as "big enough" or run twice to get it. ; * 2-dim array is workable but too hard on memory footprint. ; *----------------------------------------------------------------; * SAS hash object side note: The separate chains are organized ; * as AVL binary trees. ; *================================================================; *================================================================; * OPEN-ADDRESSING HASHING WITH LINEAR PROBING ; *================================================================; * Use 2 parallel 1-dim arrays K / D for keys(ID) / data (AGE). ; * Choose its size as the least prime number > 1.5*N. ; *----------------------------------------------------------------; * N / &size (=2/3 here( is termed a LOAD FACTOR. ; * The LOWER the load factor, the SPARSER the hash table. ; *----------------------------------------------------------------; * LOAD: ; * ---- ; * 1. Map ID to X as X = mod (ID, &size) + 1. ; * 4. Traverse K using X = X by 1 until (K[X]=. or K[X]=ID). ; * 2. If X > &size, set X=1, i.e. go to the beginning of K. ; * 3. If K[X]=., store K[X]=ID, D[X]=AGE. ; * 6. Else if K[X]=ID, key is a dupe. Get next ID and go to #1. ; *----------------------------------------------------------------; * SEARCH: ; * ------ ; * 1. Map ID to X as X = mod (ID, &size) + 1. ; * 2. Loop thru K using X = X by 1 until (K[X]=. or K[X]=ID). ; * 3. If X > &size, set X=1, i.e. go to the beginning of K. ; * 4. If K[X]=., the key is not in the table. Stop. ; * 5. Else key is found, so set AGE = D[X]. ; *================================================================; %let size = 29 ; * table size, approx 2*N (N=19) ; data _null_ ; array K [&size] _temporary_ ; array D [&size] _temporary_ ; put "------ X --> " @ ; do X = 1 to &size ; put x z2. @ +2 ; end ; put / 127 * "-" ; * load ; do until (z) ; set kd end = z ; do x = mod (ID, &size) + 1 by 1 until (K[x]=. or K[x]=ID) ; if x > &size then x = 1 ; end ; if missing (K[x]) then do ; K[x] = ID ; D[x] = AGE ; end ; put ID=z3. X=z2. @ ; do x = 1 to &size ; put K[x] z3. @ +1 ; end ; put ; end ; put ; * lookup ; do ID = 149, 502, 893, 000 ; call missing (age) ; do x = mod (ID, &size) + 1 by 1 until (K[x]=. or K[x]=ID) ; if x > &size then x = 1 ; end ; if not missing (K[x]) then age = D[x] ; put ID=z3. x=z2. age= ; end ; stop ; run ; *================================================================; * REDUCING PRIMARY CLUSTERING ; *================================================================; * Use a prime increment step > 1, e.g. 3, 5, 7, etc. ; *----------------------------------------------------------------; * Use variable prime step depending on key (DOUBLE HASHING). ; * -- Not covered here. ; *================================================================; %let size = 29 ; * table size, approx 2*N (N=19) ; data _null_ ; array K [&size] _temporary_ ; array D [&size] _temporary_ ; put "------ X --> " @ ; do X = 1 to &size ; put x z2. @ +2 ; end ; put / 127 * "-" ; * load ; do until (z) ; set kd end = z ; link traverse ; if missing (K[x]) then do ; K[x] = ID ; D[x] = AGE ; end ; put ID=z3. X=z2. @ ; do x = 1 to &size ; put K[x] z3. @ +1 ; end ; put ; end ; put ; * lookup ; do ID = 149, 502, 893, 000 ; call missing (age) ; link traverse ; if not missing (K[x]) then age = D[x] ; put ID=z3. x=z2. age= ; end ; stop ; traverse: retain step 7 ; do x = mod (ID, &size) + 1 by step until (K[x]=. or K[x]=ID) ; if x > &size then x = &size - x + 1 + step ; end ; run ; *================================================================; * HASHING NON-INTEGER KEYS: CHARACTER KEYS ; *================================================================; * Need to obtain a numeric equivalent of character key to be ; * in the MOD function. ; *----------------------------------------------------------------; * Getting a full equivalent is mostly impossible since a SAS ; * string is basically a 256-radix number. ; *----------------------------------------------------------------; * Good fast approximation is converting first 6 characters using ; * the PIB6. informat. ; *----------------------------------------------------------------; * Alternative: Using PIB6. against MD5(key). Not as fast but: ; * - Involves all characters in the hash function ; * - Thus yields guaranteed good random distribution ; *================================================================; * Suppose now that our hash key is NAME ($8) instead of ID: ; * Key: NAME, Data: AGE ; *================================================================; %let size = 29 ; * table size, approx 2*N (N=19) ; data _null_ ; array K [&size] $ 8 _temporary_ ; array D [&size] _temporary_ ; * load ; do until (z) ; set kd end = z ; link traverse ; if cmiss (K[x]) then do ; K[x] = name ; D[x] = age ; end ; end ; * lookup ; do name = "Alfred", "Jeffrey", "William", "Quentin" ; call missing (age) ; link traverse ; if not cmiss (K[x]) then age = D[x] ; put name= @14 age= ; end ; stop ; traverse: * do x = mod (input ( name , pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=name) ; do x = mod (input (MD5 (name), pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=name) ; if x > &size then x = 1 ; end ; run ; *================================================================; * HASHING COMPOSITE KEYS ; *================================================================; * 1. Cat the keys: C = CATX (, K1, K2, ...) ; * 2. Plug CAT into MD5: H = MD5 (C) ; * 3. Plug CATMD5 into INPUT: HN = INPUT (H, PIB6.) ; * 4. Plug CATMD5IN into MOD: X = MOD (HN, &size) + 1 ; *----------------------------------------------------------------; * Store H ($16) in an array K (instead of 2 arrays) as key-value ; *----------------------------------------------------------------; * Suppose now that our hash key is [ID(8), NAME($8)] ; * Key: [ID, NAME], Data: AGE ; *================================================================; %let size = 29 ; * table size, approx 2*N (N=19) ; data _null_ ; array K [&size] $ 16 _temporary_ ; * store H=MD5(CATx(":",ID,name)) here ; array D [&size] _temporary_ ; * load ; do until (z) ; set kd end = z ; link traverse ; if cmiss (K[x]) then do ; K[x] = H ; D[x] = age ; end ; end ; * lookup ; do ID = 149, 502, 893, 000 ; select (id) ; when (149) name = "Alfred" ; when (502) name = "Jeffrey" ; when (893) name = "William" ; otherwise name = "Quentin" ; end ; call missing (age) ; link traverse ; if not cmiss (K[x]) then age = D[x] ; put ID=z3. name= @21 age= ; end ; stop ; traverse: H = put (md5 (catx (":", ID, name)), $16.) ; do x = mod (input (H, pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=H) ; if x > &size then x = 1 ; end ; run ;