source: GTP/trunk/Lib/Vis/Preprocessing/src/sparsehash/src/hashtable_unittest.cc @ 2162

Revision 2162, 17.6 KB checked in by mattausch, 17 years ago (diff)

improved hash performance with google hashmap

Line 
1// Copyright (c) 2005, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30// ---
31// Author: Craig Silverstein
32//
33// This tests <google/sparsehash/densehashtable.h>
34// This tests <google/dense_hash_set>
35// This tests <google/dense_hash_map>
36// This tests <google/sparsehash/sparsehashtable.h>
37// This tests <google/sparse_hash_set>
38// This tests <google/sparse_hash_map>
39
40// Since {dense,sparse}hashtable is templatized, it's important that
41// we test every function in every class in this file -- not just to
42// see if it works, but even if it compiles.
43
44#include <google/sparsehash/config.h>
45#include <stdio.h>
46#include <sys/stat.h>          // for stat()
47#ifdef HAVE_UNISTD_H
48#include <unistd.h>            // for unlink()
49#endif
50#include <string.h>
51#include <time.h>              // for silly random-number-seed generator
52#include <math.h>              // for sqrt()
53#include <map>
54#include <iterator>            // for insert_iterator
55#include <iostream>
56#include <iomanip>             // for setprecision()
57#include <string>
58#include <google/sparsehash/hash_fun.h>
59#include <google/dense_hash_map>
60#include <google/dense_hash_set>
61#include <google/sparsehash/densehashtable.h>
62#include <google/sparse_hash_map>
63#include <google/sparse_hash_set>
64#include <google/sparsehash/sparsehashtable.h>
65
66using GOOGLE_NAMESPACE::sparse_hash_map;
67using GOOGLE_NAMESPACE::dense_hash_map;
68using GOOGLE_NAMESPACE::sparse_hash_set;
69using GOOGLE_NAMESPACE::dense_hash_set;
70using GOOGLE_NAMESPACE::sparse_hashtable;
71using GOOGLE_NAMESPACE::dense_hashtable;
72using STL_NAMESPACE::map;
73using STL_NAMESPACE::pair;
74using STL_NAMESPACE::string;
75using STL_NAMESPACE::insert_iterator;
76using STL_NAMESPACE::allocator;
77using STL_NAMESPACE::equal_to;
78using STL_NAMESPACE::ostream;
79
80#define LOGF  STL_NAMESPACE::cout   // where we log to; LOGF is a historical name
81
82#define CHECK(cond)  do {                       \
83  if (!(cond)) {                                \
84    LOGF << "Test failed: " #cond "\n";         \
85    exit(1);                                    \
86  }                                             \
87} while (0)
88
89const char *words[] = {"Baffin\n",        // in /usr/dict/words
90                       "Boffin\n",        // not in
91                       "baffin\n",        // not in
92                       "genial\n",        // last word in
93                       "Aarhus\n",        // first word alphabetically
94                       "Zurich\n",        // last word alphabetically
95};
96
97const char *nwords[] = {"Boffin\n",
98                        "baffin\n",
99};
100
101// Apparently identity is not stl-standard, so we define our own
102template<class Value>
103struct Identity {
104  Value& operator()(Value& v) const { return v; }
105  const Value& operator()(const Value& v) const { return v; }
106};
107
108// Let us log the pairs that make up a hash_map
109template<class P1, class P2>
110ostream& operator<<(ostream& s, const pair<P1, P2>& p) {
111  s << "pair(" << p.first << ", " << p.second << ")";
112  return s;
113}
114
115struct strcmp_fnc {
116  bool operator()(const char* s1, const char* s2) const {
117    return ((s1 == 0 && s2 == 0) ||
118            (s1 && s2 && *s1 == *s2 && strcmp(s1, s2) == 0));
119  }
120};
121
122
123template <class SparseTable, class T>
124static void set_empty_key(SparseTable *sp, T val) {
125}
126
127template <class T, class H, class I, class C, class A>
128static void set_empty_key(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
129  ht->set_empty_key(val);
130}
131
132template <class T, class H, class C>
133static void set_empty_key(dense_hash_set<T,H,C> *ht, T val) {
134  ht->set_empty_key(val);
135}
136
137template <class K, class V, class H, class C>
138static void set_empty_key(dense_hash_map<K,V,H,C> *ht, K val) {
139  ht->set_empty_key(val);
140}
141
142
143template <class T, class H, class I, class C, class A>
144static void insert(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
145  ht->insert(val);
146}
147
148template <class T, class H, class C>
149static void insert(dense_hash_set<T,H,C> *ht, T val) {
150  ht->insert(val);
151}
152
153template <class K, class V, class H, class C>
154static void insert(dense_hash_map<K,V,H,C> *ht, K val) {
155  ht->insert(pair<K,V>(val,V()));
156}
157
158template <class T, class H, class I, class C, class A>
159static void insert(sparse_hashtable<T,T,H,I,C,A> *ht, T val) {
160  ht->insert(val);
161}
162
163template <class T, class H, class C>
164static void insert(sparse_hash_set<T,H,C> *ht, T val) {
165  ht->insert(val);
166}
167
168template <class K, class V, class H, class C>
169static void insert(sparse_hash_map<K,V,H,C> *ht, K val) {
170  ht->insert(pair<K,V>(val,V()));
171}
172
173template <class HT, class Iterator>
174static void insert(HT *ht, Iterator begin, Iterator end) {
175  ht->insert(begin, end);
176}
177
178// For hashtable's and hash_set's, the iterator insert works fine (and
179// is used). But for the hash_map's, the iterator insert expects the
180// iterators to point to pair's. So by looping over and calling insert
181// on each element individually, the code below automatically expands
182// into inserting a pair.
183template <class K, class V, class H, class C, class Iterator>
184static void insert(dense_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
185  while (begin != end) {
186    insert(ht, *begin);
187    ++begin;
188  }
189}
190
191template <class K, class V, class H, class C, class Iterator>
192static void insert(sparse_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
193  while (begin != end) {
194    insert(ht, *begin);
195    ++begin;
196  }
197}
198
199// A version of insert that uses the insert_iterator.  But insert_iterator
200// isn't defined for the low level hashtable classes, so we just punt to insert.
201
202template <class T, class H, class I, class C, class A>
203static void iterator_insert(dense_hashtable<T,T,H,I,C,A>* ht, T val,
204                            insert_iterator<dense_hashtable<T,T,H,I,C,A> >* ) {
205  ht->insert(val);
206}
207
208template <class T, class H, class C>
209static void iterator_insert(dense_hash_set<T,H,C>* , T val,
210                            insert_iterator<dense_hash_set<T,H,C> >* ii) {
211  *(*ii)++ = val;
212}
213
214template <class K, class V, class H, class C>
215static void iterator_insert(dense_hash_map<K,V,H,C>* , K val,
216                            insert_iterator<dense_hash_map<K,V,H,C> >* ii) {
217  *(*ii)++ = pair<K,V>(val,V());
218}
219
220template <class T, class H, class I, class C, class A>
221static void iterator_insert(sparse_hashtable<T,T,H,I,C,A>* ht, T val,
222                            insert_iterator<sparse_hashtable<T,T,H,I,C,A> >* ) {
223  ht->insert(val);
224}
225
226template <class T, class H, class C>
227static void iterator_insert(sparse_hash_set<T,H,C>* , T val,
228                            insert_iterator<sparse_hash_set<T,H,C> >* ii) {
229  *(*ii)++ = val;
230}
231
232template <class K, class V, class H, class C>
233static void iterator_insert(sparse_hash_map<K,V,H,C> *, K val,
234                            insert_iterator<sparse_hash_map<K,V,H,C> >* ii) {
235  *(*ii)++ = pair<K,V>(val,V());
236}
237
238
239static void write_item(FILE *fp, const char *val) {
240  fwrite(val, strlen(val), 1, fp);   // \n serves to separate
241}
242
243// The weird 'const' declarations are desired by the compiler. Yucko.
244static void write_item(FILE *fp, const pair<char*const,int> &val) {
245  fwrite(val.first, strlen(val.first), 1, fp);
246}
247
248static char* read_line(FILE* fp, char* line, int linesize) {
249  if ( fgets(line, linesize, fp) == NULL )
250    return NULL;
251  // normalize windows files :-(
252  const int linelen = strlen(line);
253  if ( linelen >= 2 && line[linelen-2] == '\r' && line[linelen-1] == '\n' ) {
254    line[linelen-2] = '\n';
255    line[linelen-1] = '\0';
256  }
257  return line;
258}
259
260static void read_item(FILE *fp, char*const* val) {
261  char line[1024];
262  read_line(fp, line, sizeof(line));
263  char **p = const_cast<char**>(val);
264  *p = strdup(line);
265}
266
267static void read_item(FILE *fp, pair<char*const,int> *val) {
268  char line[1024];
269  read_line(fp, line, sizeof(line));
270  char **p = const_cast<char**>(&val->first);
271  *p = strdup(line);
272}
273
274static void free_item(char*const* val) {
275  free(*val);
276}
277
278static void free_item(pair<char*const,int> *val) {
279  free(val->first);
280}
281
282
283// The read_write parameters specifies whether the read/write tests
284// should be performed. Note that densehashtable::write_metdata is not
285// implemented, so we only do the read/write tests for the
286// sparsehashtable varieties.
287template<class ht, class htint>
288void test(bool read_write) {
289  htint y(1000);
290  htint z(64);
291  set_empty_key(&y, 0xefefef);
292  set_empty_key(&z, 0xefefef);
293
294  CHECK(y.empty());
295  insert(&y, 1);
296  CHECK(!y.empty());
297  insert(&y, 11);
298  insert(&y, 111);
299  insert(&y, 1111);
300  insert(&y, 11111);
301  insert(&y, 111111);
302  insert(&y, 1111111);     // 1M, more or less
303  insert(&y, 11111111);
304  insert(&y, 111111111);
305  insert(&y, 1111111111);  // 1B, more or less
306  for ( int i = 0; i < 32; ++i )
307    insert(&z, i);
308  // test the second half of the insert with an insert_iterator
309  insert_iterator<htint> insert_iter(z, z.begin());
310  for ( int i = 32; i < 64; ++i )
311    iterator_insert(&z, i, &insert_iter);
312
313  for ( typename htint::const_iterator it = y.begin(); it != y.end(); ++it )
314    LOGF << "y: " << *it << "\n";
315  z.insert(y.begin(), y.end());
316  swap(y,z);
317  for ( typename htint::iterator it = y.begin(); it != y.end(); ++it )
318    LOGF << "y+z: " << *it << "\n";
319  LOGF << "z has " << z.bucket_count() << " buckets\n";
320  LOGF << "y has " << y.bucket_count() << " buckets\n";
321  LOGF << "z size: " << z.size() << "\n";
322
323  for (int i = 0; i < 64; ++i)
324    CHECK(y.find(i) != y.end());
325
326  CHECK(z.size() == 10);
327  z.set_deleted_key(1010101010);      // an unused value
328  z.erase(11111);
329  CHECK(z.size() == 9);
330  insert(&z, 11111);                  // should retake deleted value
331  CHECK(z.size() == 10);
332  // Do the delete/insert again.  Last time we probably resized; this time no
333  z.erase(11111);
334  insert(&z, 11111);                  // should retake deleted value
335  CHECK(z.size() == 10);
336
337  z.erase(-11111);                    // shouldn't do anything
338  CHECK(z.size() == 10);
339  z.erase(1);
340  CHECK(z.size() == 9);
341  typename htint::iterator itdel = z.find(1111);
342  z.erase(itdel);
343  CHECK(z.size() == 8);
344  itdel = z.find(2222);               // should be end()
345  z.erase(itdel);                     // shouldn't do anything
346  CHECK(z.size() == 8);
347  for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
348    LOGF << "y: " << *it << "\n";
349  z.set_deleted_key(1010101011);      // a different unused value
350  for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
351    LOGF << "y: " << *it << "\n";
352  LOGF << "That's " << z.size() << " elements\n";
353  z.erase(z.begin(), z.end());
354  CHECK(z.empty());
355
356  y.clear();
357  CHECK(y.empty());
358  LOGF << "y has " << y.bucket_count() << " buckets\n";
359
360  ht w;
361  set_empty_key(&w, (char*) NULL);
362  insert(&w, const_cast<char **>(nwords),
363         const_cast<char **>(nwords) + sizeof(nwords) / sizeof(*nwords));
364  LOGF << "w has " << w.size() << " items\n";
365  CHECK(w.size() == 2);
366  CHECK(w == w);
367
368  ht x;
369  set_empty_key(&x, (char*) NULL);
370  long dict_size = 1;        // for size stats -- can't be 0 'cause of division
371
372  map<string, int> counts;
373  // Hash the dictionary
374  {
375    // automake says 'look for all data files in $srcdir.'  OK.
376    string filestr = (string(getenv("srcdir") ? getenv("srcdir") : ".") +
377                      "/src/words");
378    const char* file = filestr.c_str();
379    FILE *fp = fopen(file, "rb");
380    if ( fp == NULL ) {
381      LOGF << "Can't open " << file << " skipping dictionary hash...";
382    } else {
383      char line[1024];
384      while ( read_line(fp, line, sizeof(line)) ) {
385        insert(&x, strdup(line));
386        counts[line] = 0;
387      }
388      LOGF << "Read " << x.size() << " words from " << file << "\n";
389      fclose(fp);
390      struct stat buf;
391      stat(file, &buf);
392      dict_size = buf.st_size;
393      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
394      for ( char **word = const_cast<char **>(words);
395            word < const_cast<char **>(words) + sizeof(words) / sizeof(*words);
396            ++word ) {
397        if (x.find(*word) == x.end()) {
398          CHECK(w.find(*word) != w.end());
399        } else {
400          CHECK(w.find(*word) == w.end());
401        }
402      }
403    }
404  }
405  CHECK(counts.size() == x.size());
406
407  // Save the hashtable.
408  if (read_write) {
409    const char* file = "/tmp/#hashtable_unittest_dicthash";
410    FILE *fp = fopen(file, "wb");
411    if ( fp == NULL ) {
412      // maybe we can't write to /tmp/.  Try the current directory
413      file = "#hashtable_unittest_dicthash";
414      fp = fopen(file, "wb");
415    }
416    if ( fp == NULL ) {
417      LOGF << "Can't open " << file << " skipping hashtable save...";
418    } else {
419      x.write_metadata(fp);        // this only writes meta-information
420      int count = 0;
421      for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
422        write_item(fp, *it);
423        free_item(&(*it));
424        ++count;
425      }
426      LOGF << "Wrote " << count << " words to " << file << "\n";
427      fclose(fp);
428      struct stat buf;
429      stat(file, &buf);
430      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
431      LOGF << STL_NAMESPACE::setprecision(3)
432           << "Hashtable overhead "
433           << (buf.st_size - dict_size) * 100.0 / dict_size
434           << "% ("
435           << (buf.st_size - dict_size) * 8.0 / count
436           << " bits/entry)\n";
437      x.clear();
438
439      // Load the hashtable
440      fp = fopen(file, "rb");
441      if ( fp == NULL ) {
442        LOGF << "Can't open " << file << " skipping hashtable reload...";
443      } else {
444        x.read_metadata(fp);      // reads metainformation
445        LOGF << "Hashtable size: " << x.size() << "\n";
446        int count = 0;
447        for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
448          read_item(fp, &(*it));
449          ++count;
450        }
451        LOGF << "Read " << count << " words from " << file << "\n";
452        fclose(fp);
453        unlink(file);
454        for ( char **word = const_cast<char **>(words);
455              word < const_cast<char **>(words) + sizeof(words) / sizeof(*words);
456              ++word ) {
457          if (x.find(*word) == x.end()) {
458            CHECK(w.find(*word) != w.end());
459          } else {
460            CHECK(w.find(*word) == w.end());
461          }
462        }
463      }
464    }
465  }
466  for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
467    free_item(&(*it));
468  }
469}
470
471int main(int argc, char **argv) {
472  // First try with the low-level hashtable interface
473  LOGF << "\n\nTEST WITH DENSE_HASHTABLE\n\n";
474  test<dense_hashtable<char *, char *, HASH_NAMESPACE::hash<char *>,
475                       Identity<char *>, strcmp_fnc, allocator<char *> >,
476       dense_hashtable<int, int, HASH_NAMESPACE::hash<int>,
477                       Identity<int>, equal_to<int>, allocator<int> > >(
478                         false);
479
480  // Now try with hash_set, which should be equivalent
481  LOGF << "\n\nTEST WITH DENSE_HASH_SET\n\n";
482  test<dense_hash_set<char *, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
483       dense_hash_set<int> >(false);
484
485  // Now try with hash_map, which differs only in insert()
486  LOGF << "\n\nTEST WITH DENSE_HASH_MAP\n\n";
487  test<dense_hash_map<char *, int, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
488    dense_hash_map<int, int> >(false);
489
490  // First try with the low-level hashtable interface
491  LOGF << "\n\nTEST WITH SPARSE_HASHTABLE\n\n";
492  test<sparse_hashtable<char *, char *, HASH_NAMESPACE::hash<char *>,
493                       Identity<char *>, strcmp_fnc, allocator<char *> >,
494       sparse_hashtable<int, int, HASH_NAMESPACE::hash<int>,
495                       Identity<int>, equal_to<int>, allocator<int> > >(
496                         true);
497
498  // Now try with hash_set, which should be equivalent
499  LOGF << "\n\nTEST WITH SPARSE_HASH_SET\n\n";
500  test<sparse_hash_set<char *, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
501       sparse_hash_set<int> >(true);
502
503  // Now try with hash_map, which differs only in insert()
504  LOGF << "\n\nTEST WITH SPARSE_HASH_MAP\n\n";
505  test<sparse_hash_map<char *, int, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
506       sparse_hash_map<int, int> >(true);
507
508  LOGF << "\nAll tests pass.\n";
509  return 0;
510}
Note: See TracBrowser for help on using the repository browser.