module corpushash.hashers;

import std.stdio;
import std.range;
import std.digest.sha;
import std.base64;
import std.path : buildPath, isValidFilename;
import std.format;
import std.uni;
import std.conv : to;
import std.random : randomSample;
import std.typecons : tuple, Tuple;
import std.file;
import std.json;
import std.traits;
import std.datetime;
import pyd.pyd;

/// token -> hashed token
alias dictionary = dstring[string];
/// hashed token -> [original token, salt]
alias Ddictionary = dstring[2][string];
/// A document is a list of tokens.
alias document = dstring[];

/**
 * Hashes every token of a corpus of documents, persisting encode/decode
 * dictionaries on disk so that repeated runs over the same corpus_path
 * reuse previously generated hashes.
 */
class HashCorpus
{
    document[] corpus;              // documents to be hashed
    string corpus_path;             // base directory for all output
    string public_path;             // timestamped directory for hashed documents
    string encoding;                // informational label only; tokens are dstrings
    uint salt_length;               // number of random characters in each salt
    dictionary encode_dictionary;   // token -> hash
    Ddictionary decode_dictionary;  // hash -> [token, salt]
    string encode_dictionary_path;
    string decode_dictionary_path;

    /**
     * Params:
     *   corpus      = list of documents (each a list of tokens)
     *   corpus_path = directory where public/private output is written
     *   encoding    = label stored for reference (default "utf-32")
     *   salt_length = number of random characters per salt (default 32)
     */
    this(document[] corpus, string corpus_path, string encoding = "utf-32",
            const uint salt_length = 32)
    {
        this.corpus = corpus;
        this.corpus_path = corpus_path;
        this.public_path = this.setup_corpus_path();
        this.encoding = encoding;
        this.salt_length = salt_length;
        this.encode_dictionary_path = buildPath(this.corpus_path, "private",
                "encode_dictionary.json");
        this.decode_dictionary_path = buildPath(this.corpus_path, "private",
                "decode_dictionary.json");
        this._load_dictionaries();
        writeln(this.encode_dictionary);
        this.hash_corpus();
    }

    /// Creates the timestamped public output directory and the private
    /// dictionary directory; returns the public path.
    string setup_corpus_path()
    {
        writefln("setting up output directory on: %s", this.corpus_path);
        auto currentTime = Clock.currTime();
        string timeString = currentTime.toISOString();
        string public_path = buildPath(this.corpus_path, "public", timeString);
        mkdirRecurse(public_path);
        string priv = buildPath(this.corpus_path, "private");
        if (!priv.exists)
        {
            mkdirRecurse(priv);
        }
        return public_path;
    }

    /// Hashes every document of the corpus, writing one JSON file per
    /// document, then persists both dictionaries.
    void hash_corpus()
    {
        foreach (i, doc; this.corpus)
        {
            // copy here because _hash_document writes into its second argument
            document output_document = doc.dup;
            document encoded_document = this._hash_document(doc, output_document);
            auto encoded_document_path = buildPath(this.public_path, format!"%s.json"(i));
            this._export_encoded_document(encoded_document, encoded_document_path);
        }

        this._export_dictionary(this.encode_dictionary, this.encode_dictionary_path);
        this._export_Ddictionary(this.decode_dictionary, this.decode_dictionary_path);
        // BUG FIX: previously reported a running sum of loop indices plus one
        // instead of the actual number of documents.
        writefln("%s documents hashed and saved to %s.", this.corpus.length, this.public_path);
    }

    /// Returns the hash for `token`, reusing a previously stored hash when
    /// one exists; otherwise generates a salted hash, re-salting until it
    /// does not collide with any hash already in the decode dictionary.
    dstring _encode_token(dstring token)
    {
        string token_str = to!string(token);

        if ((token_str in this.encode_dictionary) !is null)
        {
            writeln("using existing hash");
            return this.encode_dictionary[token_str];
        }

        // BUG FIX: this.salt_length was previously ignored and the
        // hash_token default of 32 was always used.
        auto res = hash_token(token, null, this.salt_length);
        string hashed_token = res[0];
        dstring salt = res[1];

        while (hashed_token in this.decode_dictionary)
        {
            res = hash_token(token, null, this.salt_length);
            hashed_token = res[0];
            salt = res[1];
        }
        this.decode_dictionary[hashed_token] = [token, salt];
        this.encode_dictionary[token_str] = to!dstring(hashed_token);
        return to!dstring(hashed_token);
    }

    /// Replaces each token of `input_document` with its hash, writing into
    /// `output_document` (which is also returned).
    /// Throws: FileException when an item is not a string.
    document _hash_document(document input_document, document output_document)
    {
        foreach (ix, item; input_document)
        {
            if (isSomeString!(typeof(item)))
            {
                output_document[ix] = to!dstring(this._encode_token(item));
            }
            else
            {
                throw new FileException("Document must be a list of strings");
            }
        }

        return output_document;
    }

    /// Serializes the encode dictionary as JSON to `file_path`.
    void _export_dictionary(dictionary file_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(file_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    /// Serializes the decode dictionary as JSON to `file_path`.
    void _export_Ddictionary(Ddictionary file_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(file_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    /// Serializes a hashed document as JSON to `file_path`.
    void _export_encoded_document(document doc_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(doc_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    /**
     * Loads previously saved dictionaries from disk into
     * `this.encode_dictionary` / `this.decode_dictionary`; when no saved
     * dictionaries exist, both members are reset to empty.
     */
    void _load_dictionaries()
    {
        dictionary enc;
        Ddictionary dec;
        if (this.encode_dictionary_path.exists && this.decode_dictionary_path.exists)
        {
            writeln("Dictionaries from previous hashing found.\n Loading them.");
            // BUG FIX: the parsed JSON was previously bound to locals that
            // went out of scope before use, so the members were assigned to
            // themselves and the loaded data was discarded. Convert the
            // parsed values back into the native dictionary types instead.
            JSONValue encJSON = this.encode_dictionary_path.readText.parseJSON;
            JSONValue decJSON = this.decode_dictionary_path.readText.parseJSON;
            foreach (key, value; encJSON.object)
            {
                enc[key] = to!dstring(value.str);
            }
            foreach (key, value; decJSON.object)
            {
                dec[key] = [to!dstring(value.array[0].str),
                        to!dstring(value.array[1].str)];
            }
        }
        writeln(enc);
        this.encode_dictionary = enc;
        this.decode_dictionary = dec;
    }
}

/**
 * Hashes a token with SHA-256 after appending a salt.
 *
 * Params:
 *   token       = string to be hashed
 *   salt        = salt to append; when null a random salt is generated
 *   salt_length = number of random characters in a generated salt
 *                 (only used when salt is null)
 * Returns: a tuple of (hex-encoded digest, salt used)
 */
Tuple!(string, dstring) hash_token(dstring token, dstring salt = null, uint salt_length = 32)
{
    if (salt is null)
    {
        salt = get_salt(salt_length);
    }
    auto token_hasher = new SHA256Digest();
    ubyte[] token_digest = token_hasher.digest(token ~ salt);
    return tuple(toHexString(token_digest), salt);
}

/**
 * Generates a random salt drawn from the Cyrillic, Armenian and Telugu
 * Unicode blocks.
 *
 * Params:
 *   siz = number of characters in the salt (not bits)
 */
dstring get_salt(uint siz)
{
    auto unicodechars = unicode("Cyrillic") | unicode("Armenian") | unicode("Telugu");
    dstring unichars = to!(dstring)(unicodechars.byCodepoint);

    return to!dstring(randomSample(unichars, siz));
}

/**
 * Python wrapper (pyd module entry point).
 */
extern (C) void PydMain()
{
    module_init();
    def!(get_salt)();
    wrap_class!(HashCorpus,
            Property!(HashCorpus.corpus),
            Property!(HashCorpus.corpus_path),
            Property!(HashCorpus.encoding),
            Property!(HashCorpus.salt_length),
            Init!(document[], string, string, const uint))();
}

/**
 * Unittests
 **/

/// Testing the hashing
unittest
{
    document[] corp = [
        ["asdahsk", "sdlfjsldj", "çsldkfçk"], ["sdjçlkj", "sadjfl"],
        ["sdfçls", "oirgk", "sdkfj"]
    ];
    HashCorpus H = new HashCorpus(corp, "test_corpus");
    assert("asdahsk" in H.encode_dictionary);
}

/// Test the loading of dictionaries
unittest
{
    document[] corp = [
        ["asdahsk", "sdlfjsldj", "çsldkfçk"], ["sdjçlkj", "sadjfl"],
        ["sdfçls", "oirgk", "sdkfj"]
    ];
    HashCorpus H = new HashCorpus(corp, "test_corpus");
    // Make a copy of the dictionaries
    dictionary original_enc_dict = H.encode_dictionary;
    Ddictionary original_dec_dict = H.decode_dictionary;
    // Force the re-loading from disk again;
    H._load_dictionaries();
    foreach (word; [
            "asdahsk", "sdlfjsldj", "çsldkfçk", "sdjçlkj", "sadjfl",
            "sdfçls", "oirgk", "sdkfj"
        ])
    {
        assert(original_enc_dict[word] == H.encode_dictionary[word]);
    }

    assert(isAssociativeArray!(typeof(H.decode_dictionary)));
    foreach (key, val; H.decode_dictionary)
    {
        assert(isSomeString!(typeof(key)));
        assert(isArray!(typeof(val)));
        assert(key in original_dec_dict);
    }
}