*=======================================================================;
* COOKING IN-MEMORY HASH TABLE LOOKUP FROM SCRATCH                      ;
*-----------------------------------------------------------------------;
* Specifically: Array hash table lookup with LINEAR PROBING.            ;
*-----------------------------------------------------------------------;
* NOTE: Not hash object lookup or macro memory lookup.                  ;
*-----------------------------------------------------------------------;
* Based on _temporary_ arrays, as they are in memory but not in PDV.    ;
*=======================================================================;

*==================================================;
* Sample data from SASHELP.CLASS (19 observations) ;
* Added variable ID: Integer, 3-digit              ;
*==================================================;
data KD (keep = ID name age) ;
  if _n_ = 1 then call streaminit (1) ;
  retain ID 100 ;
  ID + rand ("integer", 13, 53) ;
  set sashelp.class ;
run ;

*==================================================;
* LINEAR SEARCH (key: ID, data: AGE)               ;
*==================================================;
data _null_ ;
  array K [0:19] _temporary_ ;
  array D [0:19] _temporary_ ;
* load ;
  put " X  K[x] D[x]" / 14 * "-" ;
  do x = 1 by 1 until (z) ;
    set kd end = z ;
    K[x] = ID  ;
    D[x] = age ;
    put x 2. +1 (k[x] d[x]) (4.-R) ;
  end ;
  put ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    call missing (name) ;
    K[0] = ID ;
    do x = hbound (k) by -1 until (k[x] = ID) ;
    end ;
    age = D[x] ;
    put ID=z3. x=z2. age= ;
  end ;
  stop ;
run ;

*======================================================;
* LINEAR SEARCH: PROS AND CONS                         ;
*======================================================;
* Conceptually no-brainer                              ;
* At N < 6, fastest key-comparison search              ;
*------------------------------------------------------;
* Runs in O(N) time                                    ;
* N of key-comparisons proportional to N keys in table ;
*======================================================;

*======================================================;
* BINARY SEARCH (key: ID, data: AGE)                   ;
*======================================================;
data _null_ ;
  array K [1:19] _temporary_ ;
  array D [1:19] _temporary_ ;
  put " X  K[x] D[X]" / 14 * "-" ;
* load ;
  do x = 1 by 1 until (z) ;
    set kd end = z ;
    K[x] = ID  ;
    D[x] = age ;
    put x 2. +1 (k[x] d[x]) (4.-R) ;
  end ;
  put ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    call missing (age, key_found) ;
    lo = lbound (k) ;
    hi = hbound (k) ;
    do until (lo > hi or key_found) ;
      x = floor ((lo + hi) / 2) ;
      if      id < K[x] then hi = x - 1 ;
      else if id > K[x] then lo = x + 1 ;
      else key_found = 1 ;
    end ;
    if key_found then age = D[x] ;
    put ID=z3. x=z2. age= ;
  end ;
  stop ;
run ;

*================================================================;
* BINARY SEARCH PROS: AND CONS                                   ;
*================================================================;
* Runs in O(log2(N)) time                                        ;
* N of key comparisons is proportional to log2 (N keys in table) ;
* For arbitrary key-values, fastest key-comparison search        ;
*================================================================;
* Table must be sorted (already sorted in sample data)           ;
* More complex, finicky to code - easy to get an infinite loop   ;
*================================================================;

*================================================================;
* BOTH LINEAR AND BINARY are KEY-COMPARISON SEARCH ALGORITHMS    ;
*----------------------------------------------------------------;
* For both, run-time grows with N                                ;
*================================================================;
* COULD WE DEVISE A SEARCH SCHEME SUCH THAT:                     ;
*----------------------------------------------------------------;
* 1. No comparisons between keys are needed ?                    ;
* 2. Run time does not depend on number of keys in the table N ? ;
*================================================================;

*================================================================;
* KEY-INDEXED SEARCH                                             ;
*================================================================;
* ID key-values are 3-digit integers, i.e. range from 0 to 999.  ;
* [0:999] array has an index-value for every possible key-value. ;
*----------------------------------------------------------------;
* To load, for every key-value ID from KD, we can set D[X]=AGE   ;
* where X=ID.                                                    ;
*----------------------------------------------------------------;
* To search for an ID, we only need to inspect D[X] where X=ID.  ;
* If D[X] is null, ID is not in the table, else AGE=D[X].        ;
*================================================================;

data _null_ ;
  array D [0:999] _temporary_ ;
* load ;
  do until (z) ;
    set kd end = z ;
    X = ID ;
    D[X] = age ;
  end ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    X = ID ;
    AGE = D[x] ;
    put ID=z3. x=z3. age= ;
  end ;
  stop ;
run ;

*================================================================;
* KEY-INDEXED SEARCH: Pros and Cons                              ;
*================================================================;
* Fastest existing search algorithm - theoretically unbeatable.  ;
* No comparisons between keys are necessary                      ;
* Lookup speed does not depend on number of keys in the table N. ;
* Actually practically applicable in many real-life scenarios.   ;
*================================================================;
* Extreme speed is a trade-off for obscene memory utilization.   ;
* Table D is very sparse: There are 1000 slots for 19 keys.      ;
* Hence: Good for integer keys with limited range.               ;
* Example: SAS dates as keys. [-100000:100000] will cover it.    ;
* BUT NOT SAS datetimes! SSN requires 1E9-item array. Etc.       ;
*================================================================;

*================================================================;
* HASH TABLE SEARCH: THE GREAT COMPROMISE                        ;
*================================================================;
* Keep the central idea of key-indexing: Mapping keys to array   ;
* array indices based on digital properties of the keys.         ;
*----------------------------------------------------------------;
* Make the table less sparse - e.g. approx 1.5-2 times N.        ;
*----------------------------------------------------------------;
* For a set or arbitrary keys, there exists NO a priori rule to  ;
* guarantee one-to-one mapping of key-values to array indices.   ;
* More than 1 key-value will likely map to the same array index. ;
*----------------------------------------------------------------;
* To resolve such COLLISIONS, allow SOME comparisons between     ;
* keys to tell the keys mapping to the same index apart.         ;
*================================================================;

*================================================================;
* STEP BELOW IS JUST A KEY-MAPPING DEMO.                         ;
*================================================================;
* BUT it also shows uses a real COLLISION-RESOLUTION POLICY      ;
* called SEPARATE CHAINING.                                      ;
*================================================================;

%let size = 29 ; * table size, approx 1.5*N (N=19). N / size = LOAD FACTOR ;

data _null_ ;
  array K [&size, 0:5] _temporary_ ;
  do until (z) ;
    set kd end = z ;
    X = mod (ID, &size) + 1 ; * HASH FUNCTION: maps any integer between 1 and &size ;
    k[X, 0] + 1 ;
    chain_seq = k[X, 0] ;     * Number of key-values mapped to this index X as yet  ;
    k[X, chain_seq] = id ;
  end ;
* show distribution of ID key-values among indices X ;
  do X = 1 to hbound (k, 1) ;
    put X=z2. k[X, 0]=1. @ ;
    do chain_seq = 1 to hbound (k, 2) ;
      put +1 k[X, chain_seq] z3. @ ;
    end ;
    put ;
  end ;
* do ID = 149, 502, 893, 000 ;
*   <add lookup code here as an exercise> ;
* end ;
run ;

*================================================================;
* You may want to try coding the lookup part absent from above   ;
* as a useful practical hash-coding etude. Note: you will need   ;
* to add another 2-dim array D[&size, 0:5] for the data AGE.     ;
*================================================================;

*================================================================;
* DEMO TAKEAWAYS                                                 ;
*----------------------------------------------------------------;
* For a hash table of a given load factor, run time is O(1).     ;
* Does not depend on the number of keys N in the hash table.     ;
*----------------------------------------------------------------;
* We do not know what the 2nd dimension should be beforehand.    ;
* Thus, have to allocate as "big enough" or run twice to get it. ;
* 2-dim array is workable but too hard on memory footprint.      ;
*----------------------------------------------------------------;
* SAS hash object side note: The separate chains are organized   ;
* as AVL binary trees.                                           ;
*================================================================;

*================================================================;
* OPEN-ADDRESSING HASHING WITH LINEAR PROBING                    ;
*================================================================;
* Use 2 parallel 1-dim arrays K / D for keys(ID) / data (AGE).   ;
* Choose its size as the least prime number > 1.5*N.             ;
*----------------------------------------------------------------;
* N / &size (=2/3 here( is termed a LOAD FACTOR.                 ;
* The LOWER the load factor, the SPARSER the hash table.         ;
*----------------------------------------------------------------;
* LOAD:                                                          ;
* ----                                                           ;
* 1. Map ID to X as X = mod (ID, &size) + 1.                     ;
* 4. Traverse K using X = X by 1 until (K[X]=. or K[X]=ID).      ;
* 2. If X > &size, set X=1, i.e. go to the beginning of K.       ;
* 3. If K[X]=., store K[X]=ID, D[X]=AGE.                         ;
* 6. Else if K[X]=ID, key is a dupe. Get next ID and go to #1.   ;
*----------------------------------------------------------------;
* SEARCH:                                                        ;
* ------                                                         ;
* 1. Map ID to X as X = mod (ID, &size) + 1.                     ;
* 2. Loop thru K using X = X by 1 until (K[X]=. or K[X]=ID).     ;
* 3. If X > &size, set X=1, i.e. go to the beginning of K.       ;
* 4. If K[X]=., the key is not in the table. Stop.               ;
* 5. Else key is found, so set AGE = D[X].                       ;
*================================================================;

%let size = 29 ; * table size, approx 2*N (N=19) ;

data _null_ ;
  array K [&size] _temporary_ ;
  array D [&size] _temporary_ ;
  put "------ X --> " @ ;
  do X = 1 to &size ;
    put x z2. @ +2 ;
  end ;
  put / 127 * "-" ;
* load ;
  do until (z) ;
    set kd end = z ;
    do x = mod (ID, &size) + 1 by 1 until (K[x]=. or K[x]=ID) ;
      if x > &size then x = 1 ;
    end ;
    if missing (K[x]) then do ;
      K[x] = ID ;
      D[x] = AGE ;
    end ;
    put ID=z3. X=z2. @ ;
    do x = 1 to &size ;
      put K[x] z3. @ +1 ;
    end ;
    put ;
  end ;
  put ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    call missing (age) ;
    do x = mod (ID, &size) + 1 by 1 until (K[x]=. or K[x]=ID) ;
      if x > &size then x = 1 ;
    end ;
    if not missing (K[x]) then age = D[x] ;
    put ID=z3. x=z2. age= ;
  end ;
  stop ;
run ;

*================================================================;
* REDUCING PRIMARY CLUSTERING                                    ;
*================================================================;
* Use a prime increment step > 1, e.g. 3, 5, 7, etc.             ;
*----------------------------------------------------------------;
* Use variable prime step depending on key (DOUBLE HASHING).     ;
* -- Not covered here.                                           ;
*================================================================;

%let size = 29 ; * table size, approx 2*N (N=19) ;

data _null_ ;
  array K [&size] _temporary_ ;
  array D [&size] _temporary_ ;
  put "------ X --> " @ ;
  do X = 1 to &size ;
    put x z2. @ +2 ;
  end ;
  put / 127 * "-" ;
* load ;
  do until (z) ;
    set kd end = z ;
    link traverse ;
    if missing (K[x]) then do ;
      K[x] = ID ;
      D[x] = AGE ;
    end ;
    put ID=z3. X=z2. @ ;
    do x = 1 to &size ;
      put K[x] z3. @ +1 ;
    end ;
    put ;
  end ;
  put ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    call missing (age) ;
    link traverse ;
    if not missing (K[x]) then age = D[x] ;
    put ID=z3. x=z2. age= ;
  end ;
  stop ;
  traverse: retain step 7 ;
            do x = mod (ID, &size) + 1 by step until (K[x]=. or K[x]=ID) ;
              if x > &size then x = &size - x + 1 + step ;
            end ;
run ;

*================================================================;
* HASHING NON-INTEGER KEYS: CHARACTER KEYS                       ;
*================================================================;
* Need to obtain a numeric equivalent of character key to be     ;
* in the MOD function.                                           ;
*----------------------------------------------------------------;
* Getting a full equivalent is mostly impossible since a SAS     ;
* string is basically a 256-radix number.                        ;
*----------------------------------------------------------------;
* Good fast approximation is converting first 6 characters using ;
* the PIB6. informat.                                            ;
*----------------------------------------------------------------;
* Alternative: Using PIB6. against MD5(key). Not as fast but:    ;
* - Involves all characters in the hash function                 ;
* - Thus yields guaranteed good random distribution              ;
*================================================================;
* Suppose now that our hash key is NAME ($8) instead of ID:      ;
* Key: NAME, Data: AGE                                           ;
*================================================================;

%let size = 29 ; * table size, approx 2*N (N=19) ;

data _null_ ;
  array K [&size] $ 8 _temporary_ ;
  array D [&size]     _temporary_ ;
* load ;
  do until (z) ;
    set kd end = z ;
    link traverse ;
    if cmiss (K[x]) then do ;
      K[x] = name ;
      D[x] = age ;
    end ;
  end ;
* lookup ;
  do name = "Alfred", "Jeffrey", "William", "Quentin" ;
    call missing (age) ;
    link traverse ;
    if not cmiss (K[x]) then age = D[x] ;
    put name= @14 age= ;
  end ;
  stop ;
  traverse:
  *  do x = mod (input (     name , pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=name) ;
   do x = mod (input (MD5 (name), pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=name) ;
      if x > &size then x = 1 ;
    end ;
run ;

*================================================================;
* HASHING COMPOSITE KEYS                                         ;
*================================================================;
* 1. Cat the keys: C = CATX (<dlm>, K1, K2, ...)                 ;
* 2. Plug CAT into MD5: H = MD5 (C)                              ;
* 3. Plug CATMD5 into INPUT: HN = INPUT (H, PIB6.)               ;
* 4. Plug CATMD5IN into MOD: X = MOD (HN, &size) + 1             ;
*----------------------------------------------------------------;
* Store H ($16) in an array K (instead of 2 arrays) as key-value ;
*----------------------------------------------------------------;
* Suppose now that our hash key is [ID(8), NAME($8)]             ;
* Key: [ID, NAME], Data: AGE                                     ;
*================================================================;

%let size = 29 ; * table size, approx 2*N (N=19) ;

data _null_ ;
  array K [&size] $ 16 _temporary_ ; * store H=MD5(CATx(":",ID,name)) here ;
  array D [&size]      _temporary_ ;
* load ;
  do until (z) ;
    set kd end = z ;
    link traverse ;
    if cmiss (K[x]) then do ;
      K[x] = H ;
      D[x] = age ;
    end ;
  end ;
* lookup ;
  do ID = 149, 502, 893, 000 ;
    select (id) ;
      when (149) name = "Alfred"  ;
      when (502) name = "Jeffrey" ;
      when (893) name = "William" ;
      otherwise  name = "Quentin" ;
    end ;
    call missing (age) ;
    link traverse ;
    if not cmiss (K[x]) then age = D[x] ;
    put ID=z3. name= @21 age= ;
  end ;
  stop ;
  traverse:
    H = put (md5 (catx (":", ID, name)), $16.) ;
    do x = mod (input (H, pib6.), &size) + 1 by 1 until (cmiss (K[x]) or K[x]=H) ;
      if x > &size then x = 1 ;
    end ;
run ;