1 module corpushash.hashers;
2 
3 import std.stdio;
4 import std.range;
5 import std.digest.sha;
6 import std.base64;
7 import std.path : buildPath, isValidFilename;
8 import std.format;
9 import std.uni;
10 import std.conv : to;
11 import std.random : randomSample;
12 import std.range : chain;
13 import std.typecons : tuple, Tuple;
14 import std.file;
15 import std.json;
16 import std.traits;
17 import std.datetime;
18 import pyd.pyd;
19 
20 alias dictionary = dstring[string];
21 alias Ddictionary = dstring[2][string];
22 alias document = dstring[];
23 
class HashCorpus
{
    document[] corpus;             // documents to hash; each document is a list of tokens
    string corpus_path;            // root directory under which "public"/"private" trees live
    string public_path;            // timestamped output directory for hashed documents
    string encoding;               // declared text encoding label (informational; not used by the hashing itself)
    uint salt_length;              // number of salt characters appended to a token before hashing
    dictionary encode_dictionary;  // token -> hashed token
    Ddictionary decode_dictionary; // hashed token -> [original token, salt]
    string encode_dictionary_path;
    string decode_dictionary_path;

    /**
     * Sets up the output directory layout, loads any previously saved
     * dictionaries, then immediately hashes the given corpus to disk.
     *
     * Params:
     *   corpus = list of documents, each a list of tokens
     *   corpus_path = directory under which "public" and "private" trees are created
     *   encoding = informational encoding label (default "utf-32")
     *   salt_length = number of salt characters used when hashing a token
     */
    this(document[] corpus, string corpus_path, string encoding = "utf-32",
            const uint salt_length = 32)
    {
        this.corpus = corpus;
        this.corpus_path = corpus_path;
        this.public_path = this.setup_corpus_path();
        this.encoding = encoding;
        this.salt_length = salt_length;
        this.encode_dictionary_path = buildPath(this.corpus_path, "private",
                "encode_dictionary.json");
        this.decode_dictionary_path = buildPath(this.corpus_path, "private",
                "decode_dictionary.json");
        this._load_dictionaries();
        this.hash_corpus();
    }

    ///Creates the timestamped public output directory (and the private one if missing); returns the public path.
    string setup_corpus_path()
    {
        writefln("setting up output directory on: %s", this.corpus_path);
        auto currentTime = Clock.currTime();
        string timeString = currentTime.toISOString();
        string public_path = buildPath(this.corpus_path, "public", timeString);
        mkdirRecurse(public_path);
        string priv = buildPath(this.corpus_path, "private");
        if (!priv.exists)
        {
            mkdirRecurse(priv);
        }
        return public_path;
    }

    ///Hashes every document and writes each as "<index>.json" in the public path, then persists both dictionaries.
    void hash_corpus()
    {
        foreach (i, doc; this.corpus)
        {
            document output_document = doc.dup; // copy: _hash_document writes into it
            document encoded_document = this._hash_document(doc, output_document);
            auto encoded_document_path = buildPath(this.public_path, format!"%s.json"(i));
            this._export_encoded_document(encoded_document, encoded_document_path);
        }

        this._export_dictionary(this.encode_dictionary, this.encode_dictionary_path);
        this._export_Ddictionary(this.decode_dictionary, this.decode_dictionary_path);
        // BUG FIX: the old code summed loop indices and reported "sum + 1"
        // instead of the actual number of documents processed.
        writefln("%s documents hashed and saved to %s.", this.corpus.length, this.public_path);
    }

    ///Returns the salted hash for one token, reusing a cached hash when the token was seen before.
    dstring _encode_token(dstring token)
    {
        string token_str = to!string(token);

        if (auto cached = token_str in this.encode_dictionary)
        {
            writeln("using existing hash");
            return *cached;
        }

        // BUG FIX: honor the configured salt length instead of always using
        // hash_token's default.
        auto res = hash_token(token, null, this.salt_length);
        // Re-salt until the digest is unique among already-issued hashes
        // (collisions are astronomically unlikely, but kept from the original).
        while (res[0] in this.decode_dictionary)
        {
            res = hash_token(token, null, this.salt_length);
        }
        this.decode_dictionary[res[0]] = [token, res[1]];
        this.encode_dictionary[token_str] = to!dstring(res[0]);
        return to!dstring(res[0]);
    }

    ///Encodes every token of input_document into output_document (written in place) and returns it.
    document _hash_document(document input_document, document output_document)
    {
        foreach (ix, item; input_document)
        {
            // Each item is statically a dstring, so the old runtime
            // isSomeString!(typeof(item)) test was a compile-time constant and
            // its FileException branch was unreachable; both were removed.
            output_document[ix] = this._encode_token(item);
        }

        return output_document;
    }

    ///Serializes a token->hash dictionary as JSON to file_path.
    void _export_dictionary(dictionary file_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(file_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    ///Serializes a hash->[token, salt] dictionary as JSON to file_path.
    void _export_Ddictionary(Ddictionary file_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(file_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    ///Serializes one encoded document as JSON to file_path.
    void _export_encoded_document(document doc_to_dump, string file_path)
    {
        JSONValue payload = JSONValue(doc_to_dump);
        File(file_path, "w").write(payload.toJSON);
    }

    /**
     * Loads previously saved dictionaries from disk into the member fields,
     * or starts with empty ones when no saved files exist.
     *
     * BUG FIX: the old code parsed the JSON into block-scoped locals that went
     * out of scope immediately, so the subsequent assignments were
     * self-assignments of the member fields and the loaded data was discarded.
     */
    void _load_dictionaries()
    {
        dictionary loaded_encode;
        Ddictionary loaded_decode;
        if (this.encode_dictionary_path.exists && this.decode_dictionary_path.exists)
        {
            writeln("Dictionaries from previous hashing found.\n Loading them.");
            JSONValue encode_json = this.encode_dictionary_path.readText.parseJSON;
            JSONValue decode_json = this.decode_dictionary_path.readText.parseJSON;
            // Convert the parsed JSON objects back into the associative-array
            // field types (JSON stores plain strings; fields hold dstrings).
            foreach (key, val; encode_json.object)
            {
                loaded_encode[key] = to!dstring(val.str);
            }
            foreach (key, val; decode_json.object)
            {
                loaded_decode[key] = [to!dstring(val.array[0].str),
                    to!dstring(val.array[1].str)];
            }
        }
        this.encode_dictionary = loaded_encode;
        this.decode_dictionary = loaded_decode;
    }
}
182 
183 /**
184 * Hashes a string adding a random salt to it of specified size
185 * params:
186 *   token = string to be hashed
187 *   salt = Salt to add
188 *   salt_length = Length of the salt in bits
189 */
190 Tuple!(string,dstring) hash_token(dstring token, dstring salt = null, uint salt_length = 32)
191 {
192     if (salt is null)
193     {
194         salt = get_salt(salt_length);
195     }
196     auto token_hasher = new SHA256Digest();
197     ubyte[] token_digest = token_hasher.digest(token ~ salt);
198     return tuple(toHexString(token_digest), salt);
199 }
200 /**
201 *  Random salt generator
202 *  params:
203 *  siz = Length of the salt in bits
204 */
205 dstring get_salt(uint siz)
206 {
207     auto unicodechars = unicode("Cyrillic") | unicode("Armenian") | unicode("Telugu");
208     dstring unichars = to!(dstring)(unicodechars.byCodepoint);
209 
210     return to!dstring(randomSample(unichars, siz));
211 }
212 
213 /**
214 * Python wrapper
215 */
216 extern (C) void PydMain()
217 {
218     alias document = dstring[];
219     module_init();
220     def!(get_salt)();
221     wrap_class!(HashCorpus,
222             Property!(HashCorpus.corpus),
223             Property!(HashCorpus.corpus_path),
224             Property!(HashCorpus.encoding),
225             Property!(HashCorpus.salt_length),
226             Init!(document[], string, string, const uint),)();
227 
228 }
229 
230 /**
231 * Unittests
232 **/
233 
/// Verifies that hashing a corpus records every token in the encode dictionary.
unittest
{
    document[] sample = [
        ["asdahsk", "sdlfjsldj", "çsldkfçk"],
        ["sdjçlkj", "sadjfl"],
        ["sdfçls", "oirgk", "sdkfj"]
    ];
    auto hasher = new HashCorpus(sample, "test_corpus");
    assert("asdahsk" in hasher.encode_dictionary);
}
241 
/// Verifies that dictionaries reloaded from disk agree with the in-memory ones.
unittest
{
    document[] sample = [
        ["asdahsk", "sdlfjsldj", "çsldkfçk"], ["sdjçlkj", "sadjfl"],
        ["sdfçls", "oirgk", "sdkfj"]
    ];
    auto hasher = new HashCorpus(sample, "test_corpus");
    // Snapshot the dictionaries before forcing a reload from disk.
    dictionary enc_before = hasher.encode_dictionary;
    Ddictionary dec_before = hasher.decode_dictionary;
    hasher._load_dictionaries();

    auto words = [
        "asdahsk", "sdlfjsldj", "çsldkfçk", "sdjçlkj", "sadjfl",
        "sdfçls", "oirgk", "sdkfj"
    ];
    foreach (word; words)
    {
        assert(enc_before[word] == hasher.encode_dictionary[word]);
    }

    assert(isAssociativeArray!(typeof(hasher.decode_dictionary)));
    foreach (key, val; hasher.decode_dictionary)
    {
        assert(isSomeString!(typeof(key)));
        assert(isArray!(typeof(val)));
        assert(key in dec_before);
    }
}