source: GTP/trunk/Lib/Vis/Preprocessing/src/sparsehash/src/google/sparsehash/sparsehashtable.h @ 2162

Revision 2162, 37.0 KB, checked in by mattausch, 17 years ago

improved hash performance with google hashmap

// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Craig Silverstein
//
// A sparse hashtable is a particular implementation of
// a hashtable: one that is meant to minimize memory use.
// It does this by using a *sparse table* (cf sparsetable.h),
// which uses between 1 and 2 bits to store empty buckets
// (we may need another bit for hashtables that support deletion).
//
// When empty buckets are so cheap, an appealing hashtable
// implementation is internal probing, in which the hashtable
// is a single table, and collisions are resolved by trying
// to insert again in another bucket.  The most cache-efficient
// internal probing schemes are linear probing (which suffers,
// alas, from clumping) and quadratic probing, which is what
// we implement by default.
//
// Deleted buckets are a bit of a pain.  We have to somehow mark
// deleted buckets (the probing must distinguish them from empty
// buckets).  The most principled way is to have another bitmap,
// but that's annoying and takes up space.  Instead we let the
// user specify an "impossible" key.  We set deleted buckets
// to have the impossible key.
//
// Note it is possible to change the value of the delete key
// on the fly; you can even remove it, though after that point
// the hashtable is insert_only until you set it again.
//
// You probably shouldn't use this code directly.  Use
// <google/sparse_hash_map> or <google/sparse_hash_set> instead.

// You can change the following below:
// HT_OCCUPANCY_FLT      -- how full before we double size
// HT_EMPTY_FLT          -- how empty before we halve size
// HT_MIN_BUCKETS        -- smallest bucket size
//
// How to decide what values to use?
// HT_EMPTY_FLT's default of .4 * OCCUPANCY_FLT is probably good.
// HT_MIN_BUCKETS is probably unnecessary, since you can specify
// (indirectly) the starting number of buckets at construct-time.
// For HT_OCCUPANCY_FLT, you can use this chart to trade off
// expected lookup time against the space taken up.  By default, this
// code uses quadratic probing, though you can change it to linear
// via JUMP_ below if you really want to.
//
// From http://www.augustana.ca/~mohrj/courses/1999.fall/csc210/lecture_notes/hashing.html
// NUMBER OF PROBES / LOOKUP       Successful            Unsuccessful
// Quadratic collision resolution   1 - ln(1-L) - L/2    1/(1-L) - L - ln(1-L)
// Linear collision resolution     [1+1/(1-L)]/2         [1+1/(1-L)^2]/2
//
// -- HT_OCCUPANCY_FLT --         0.10  0.50  0.60  0.75  0.80  0.90  0.99
// QUADRATIC COLLISION RES.
//    probes/successful lookup    1.05  1.44  1.62  2.01  2.21  2.85  5.11
//    probes/unsuccessful lookup  1.11  2.19  2.82  4.64  5.81  11.4  103.6
// LINEAR COLLISION RES.
//    probes/successful lookup    1.06  1.5   1.75  2.5   3.0   5.5   50.5
//    probes/unsuccessful lookup  1.12  2.5   3.6   8.5   13.0  50.0  5000.0
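//
// Sanity-checking the quadratic formulas at load factor L = 0.8
// (an added illustration):
//   successful:   1 - ln(1-0.8) - 0.8/2  =  1 + 1.609 - 0.4       ~= 2.21
//   unsuccessful: 1/(1-0.8) - 0.8 - ln(1-0.8)  =  5 - 0.8 + 1.609 ~= 5.81
// which matches the 0.80 column of the table above.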


#ifndef _SPARSEHASHTABLE_H_
#define _SPARSEHASHTABLE_H_

#ifndef SPARSEHASH_STAT_UPDATE
#define SPARSEHASH_STAT_UPDATE(x) ((void) 0)
#endif

// The probing method
// Linear probing
// #define JUMP_(key, num_probes)    ( 1 )
// Quadratic-ish probing
#define JUMP_(key, num_probes)    ( num_probes )
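
// A sketch of the probe sequence JUMP_ produces (an added illustration;
// "num_buckets" here is just a stand-in for the table's power-of-two
// bucket count).  With the quadratic-ish variant, probe i adds i to the
// previous position, so the offsets from the home bucket are the
// triangular numbers 1, 3, 6, 10, ...; on a power-of-two table this
// sequence visits every bucket before repeating:
//
//   size_t bucknum = hash(key) & (num_buckets - 1);   // home bucket
//   size_t num_probes = 0;
//   while (table.test(bucknum)) {                     // occupied: keep going
//     ++num_probes;
//     bucknum = (bucknum + JUMP_(key, num_probes)) & (num_buckets - 1);
//   }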


// Hashtable class, used to implement the hashed associative containers
// hash_set and hash_map.

#include <google/sparsehash/config.h>
#include <assert.h>
#include <algorithm>              // For swap(), eg
#include <google/sparsetable>     // Since that's basically what we are

_START_GOOGLE_NAMESPACE_

using STL_NAMESPACE::pair;

// sparsetable uses malloc()/realloc()/free(), so Alloc is basically
// ignored.  We include it because we're being compatible.
// TODO(csilvers): is that the right thing to do?

template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey, class Alloc>
class sparse_hashtable;

template <class V, class K, class HF, class ExK, class EqK, class A>
struct sparse_hashtable_iterator;

template <class V, class K, class HF, class ExK, class EqK, class A>
struct sparse_hashtable_const_iterator;

// As far as iterating, we're basically just a sparsetable
// that skips over deleted elements.
template <class V, class K, class HF, class ExK, class EqK, class A>
struct sparse_hashtable_iterator {
 public:
  typedef sparse_hashtable<V,K,HF,ExK,EqK,A>                sparse_hashtable;
  typedef sparse_hashtable_iterator<V,K,HF,ExK,EqK,A>       iterator;
  typedef sparse_hashtable_const_iterator<V,K,HF,ExK,EqK,A> const_iterator;
  typedef typename sparsetable<V>::nonempty_iterator        st_iterator;

#ifdef UNDERSTANDS_ITERATOR_TAGS
  typedef STL_NAMESPACE::forward_iterator_tag iterator_category;
#endif
  typedef V value_type;
  typedef ptrdiff_t difference_type;
  typedef size_t size_type;
  typedef V& reference;                // Value
  typedef V* pointer;

  // "Real" constructor and default constructor
  sparse_hashtable_iterator(const sparse_hashtable *h,
                            st_iterator it, st_iterator it_end)
    : ht(h), pos(it), end(it_end)   { advance_past_deleted(); }
  sparse_hashtable_iterator() { }      // never used internally
  // The default destructor is fine; we don't define one
  // The default operator= is fine; we don't define one

  // Happy dereferencer
  reference operator*() const { return *pos; }
  pointer operator->() const { return &(operator*()); }

  // Arithmetic.  The only hard part is making sure that
  // we're not on a marked-deleted array element
  void advance_past_deleted() {
    while ( pos != end && ht->test_deleted(*this) )
      ++pos;
  }
  iterator& operator++()   {
    assert(pos != end); ++pos; advance_past_deleted(); return *this;
  }
  iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; }

  // Comparison.
  bool operator==(const iterator& it) const { return pos == it.pos; }
  bool operator!=(const iterator& it) const { return pos != it.pos; }


  // The actual data
  const sparse_hashtable *ht;
  st_iterator pos, end;
};

// Now do it all again, but with const-ness!
template <class V, class K, class HF, class ExK, class EqK, class A>
struct sparse_hashtable_const_iterator {
 public:
  typedef sparse_hashtable<V,K,HF,ExK,EqK,A>                sparse_hashtable;
  typedef sparse_hashtable_iterator<V,K,HF,ExK,EqK,A>       iterator;
  typedef sparse_hashtable_const_iterator<V,K,HF,ExK,EqK,A> const_iterator;
  typedef typename sparsetable<V>::const_nonempty_iterator  st_iterator;

#ifdef UNDERSTANDS_ITERATOR_TAGS
  typedef STL_NAMESPACE::forward_iterator_tag iterator_category;
#endif
  typedef V value_type;
  typedef ptrdiff_t difference_type;
  typedef size_t size_type;
  typedef const V& reference;                // Value
  typedef const V* pointer;

  // "Real" constructor and default constructor
  sparse_hashtable_const_iterator(const sparse_hashtable *h,
                                  st_iterator it, st_iterator it_end)
    : ht(h), pos(it), end(it_end)   { advance_past_deleted(); }
  // This lets us convert regular iterators to const iterators
  sparse_hashtable_const_iterator() { }      // never used internally
  sparse_hashtable_const_iterator(const iterator &it)
    : ht(it.ht), pos(it.pos), end(it.end) { }
  // The default destructor is fine; we don't define one
  // The default operator= is fine; we don't define one

  // Happy dereferencer
  reference operator*() const { return *pos; }
  pointer operator->() const { return &(operator*()); }

  // Arithmetic.  The only hard part is making sure that
  // we're not on a marked-deleted array element
  void advance_past_deleted() {
    while ( pos != end && ht->test_deleted(*this) )
      ++pos;
  }
  const_iterator& operator++() {
    assert(pos != end); ++pos; advance_past_deleted(); return *this;
  }
  const_iterator operator++(int) { const_iterator tmp(*this); ++*this; return tmp; }

  // Comparison.
  bool operator==(const const_iterator& it) const { return pos == it.pos; }
  bool operator!=(const const_iterator& it) const { return pos != it.pos; }


  // The actual data
  const sparse_hashtable *ht;
  st_iterator pos, end;
};

// And once again, but this time freeing up memory as we iterate
template <class V, class K, class HF, class ExK, class EqK, class A>
struct sparse_hashtable_destructive_iterator {
 public:
  typedef sparse_hashtable<V,K,HF,ExK,EqK,A>                sparse_hashtable;
  typedef sparse_hashtable_destructive_iterator<V,K,HF,ExK,EqK,A> iterator;
  typedef typename sparsetable<V>::destructive_iterator     st_iterator;

#ifdef UNDERSTANDS_ITERATOR_TAGS
  typedef STL_NAMESPACE::forward_iterator_tag iterator_category;
#endif
  typedef V value_type;
  typedef ptrdiff_t difference_type;
  typedef size_t size_type;
  typedef V& reference;                // Value
  typedef V* pointer;

  // "Real" constructor and default constructor
  sparse_hashtable_destructive_iterator(const sparse_hashtable *h,
                                        st_iterator it, st_iterator it_end)
    : ht(h), pos(it), end(it_end)   { advance_past_deleted(); }
  sparse_hashtable_destructive_iterator() { }          // never used internally
  // The default destructor is fine; we don't define one
  // The default operator= is fine; we don't define one

  // Happy dereferencer
  reference operator*() const { return *pos; }
  pointer operator->() const { return &(operator*()); }

  // Arithmetic.  The only hard part is making sure that
  // we're not on a marked-deleted array element
  void advance_past_deleted() {
    while ( pos != end && ht->test_deleted(*this) )
      ++pos;
  }
  iterator& operator++()   {
    assert(pos != end); ++pos; advance_past_deleted(); return *this;
  }
  iterator operator++(int) { iterator tmp(*this); ++*this; return tmp; }

  // Comparison.
  bool operator==(const iterator& it) const { return pos == it.pos; }
  bool operator!=(const iterator& it) const { return pos != it.pos; }


  // The actual data
  const sparse_hashtable *ht;
  st_iterator pos, end;
};



template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey, class Alloc>
class sparse_hashtable {
 public:
  typedef Key key_type;
  typedef Value value_type;
  typedef HashFcn hasher;
  typedef EqualKey key_equal;

  typedef size_t            size_type;
  typedef ptrdiff_t         difference_type;
  typedef value_type*       pointer;
  typedef const value_type* const_pointer;
  typedef value_type&       reference;
  typedef const value_type& const_reference;
  typedef sparse_hashtable_iterator<Value, Key, HashFcn,
                                    ExtractKey, EqualKey, Alloc>
  iterator;

  typedef sparse_hashtable_const_iterator<Value, Key, HashFcn,
                                          ExtractKey, EqualKey, Alloc>
  const_iterator;

  typedef sparse_hashtable_destructive_iterator<Value, Key, HashFcn,
                                                ExtractKey, EqualKey, Alloc>
  destructive_iterator;

  // How full we let the table get before we resize.  Knuth says .8 is
  // good -- higher causes us to probe too much, though saves memory
  static const float HT_OCCUPANCY_FLT; // = 0.8f;

  // How empty we let the table get before we resize lower.
  // It should be less than OCCUPANCY_FLT / 2 or we thrash resizing
  static const float HT_EMPTY_FLT; // = 0.4 * HT_OCCUPANCY_FLT;

  // Minimum size we're willing to let hashtables be.
  // Must be a power of two, and at least 4.
  // Note, however, that for a given hashtable, the minimum size is
  // determined by the first constructor arg, and may be >HT_MIN_BUCKETS.
  static const size_t HT_MIN_BUCKETS = 32;
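
  // Worked example of these knobs (an added illustration, assuming the
  // defaults HT_OCCUPANCY_FLT = 0.8 and HT_EMPTY_FLT = 0.32): with 1024
  // buckets, enlarge_threshold = 1024 * 0.8 = 819, so the insert that
  // would make 820 buckets nonempty first doubles the table to 2048;
  // shrink_threshold = 1024 * 0.32 = 327, so falling to 327 or fewer
  // live elements lets maybe_shrink() halve the table (though never
  // below HT_MIN_BUCKETS).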


  // ITERATOR FUNCTIONS
  iterator begin()             { return iterator(this, table.nonempty_begin(),
                                                 table.nonempty_end()); }
  iterator end()               { return iterator(this, table.nonempty_end(),
                                                 table.nonempty_end()); }
  const_iterator begin() const { return const_iterator(this,
                                                       table.nonempty_begin(),
                                                       table.nonempty_end()); }
  const_iterator end() const   { return const_iterator(this,
                                                       table.nonempty_end(),
                                                       table.nonempty_end()); }

  // This is used when resizing
  destructive_iterator destructive_begin() {
    return destructive_iterator(this, table.destructive_begin(),
                                table.destructive_end());
  }
  destructive_iterator destructive_end() {
    return destructive_iterator(this, table.destructive_end(),
                                table.destructive_end());
  }


  // ACCESSOR FUNCTIONS for the things we templatize on, basically
  hasher hash_funct() const { return hash; }
  key_equal key_eq() const  { return equals; }

  // Annoyingly, we can't copy values around, because they might have
  // const components (they're probably pair<const X, Y>).  We use
  // placement new to get around this.  Arg.
 private:
  void set_value(value_type* dst, const value_type& src) {
    new(dst) value_type(src);
  }

  void set_key(key_type* dst, const key_type& src) {
    new(dst) key_type(src);   // used for set_deleted_key(), etc
  }
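
  // Why placement new here (an added note, not in the original source):
  // with value_type = pair<const Key, T>, plain assignment
  //     *dst = src;                  // won't compile: assigns through a
  //                                  // const member
  // is rejected, but constructing a fresh copy in the raw storage
  //     new (dst) value_type(src);   // copy-construct in place
  // is fine, since construction (unlike assignment) may initialize
  // const members.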

  // This is used as a tag for the copy constructor, saying to destroy its
  // arg.  We have two ways of destructively copying: one that potentially
  // grows the hashtable as we copy, and one that doesn't.  To make sure the
  // outside world can't do a destructive copy, we make the typename private.
  enum MoveDontCopyT {MoveDontCopy, MoveDontGrow};


  // DELETE HELPER FUNCTIONS
  // This lets the user describe a key that will indicate deleted
  // table entries.  This key should be an "impossible" entry --
  // if you try to insert it for real, you won't be able to retrieve it!
  // (NB: while you pass in an entire value, only the key part is looked
  // at.  This is just because I don't know how to assign just a key.)
 private:
  void squash_deleted() {           // gets rid of any deleted entries we have
    if ( num_deleted ) {            // get rid of deleted before writing
      sparse_hashtable tmp(MoveDontGrow, *this);
      swap(tmp);                    // now we are tmp
    }
    assert(num_deleted == 0);
  }

 public:
  void set_deleted_key(const key_type &key) {
    // It's only safe to change what "deleted" means if we purge deleted guys
    squash_deleted();
    use_deleted = true;
    set_key(&delkey, key);         // save the key
  }
  void clear_deleted_key() {
    squash_deleted();
    use_deleted = false;
  }
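
  // How this looks from the outside (an added sketch, using the public
  // sparse_hash_set wrapper rather than this class directly):
  //   google::sparse_hash_set<int> s;
  //   s.set_deleted_key(-1);   // -1 must never be inserted as a real key
  //   s.insert(5);
  //   s.erase(5);              // the bucket 5 lived in now holds key -1
  // Until set_deleted_key() has been called, erase() must not be used.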

  // These are public so the iterators can use them
  // True if the item at position bucknum is the "deleted" marker
  bool test_deleted(size_type bucknum) const {
    // The num_deleted test is crucial for read(): after read(), the ht values
    // are garbage, and we don't want to think some of them are deleted.
    return (use_deleted && num_deleted > 0 && table.test(bucknum) &&
            equals(delkey, get_key(table.get(bucknum))));
  }
  bool test_deleted(const iterator &it) const {
    return (use_deleted && num_deleted > 0 &&
            equals(delkey, get_key(*it)));
  }
  bool test_deleted(const const_iterator &it) const {
    return (use_deleted && num_deleted > 0 &&
            equals(delkey, get_key(*it)));
  }
  bool test_deleted(const destructive_iterator &it) const {
    return (use_deleted && num_deleted > 0 &&
            equals(delkey, get_key(*it)));
  }
  // Set it so test_deleted is true.  Returns true if the object wasn't
  // previously deleted.  See below (at erase()) for why we allow
  // const_iterators here.
  bool set_deleted(const_iterator &it) {
    assert(use_deleted);             // bad if set_deleted_key() wasn't called
    bool retval = !test_deleted(it);
    // &* converts from iterator to value-type
    set_key(const_cast<key_type*>(&get_key(*it)), delkey);
    return retval;
  }
  // Set it so test_deleted is false.  Returns true if the object used to
  // be deleted.
  bool clear_deleted(const_iterator &it) {
    assert(use_deleted);             // bad if set_deleted_key() wasn't called
    // happens automatically when we assign something else in its place
    return test_deleted(it);
  }


  // FUNCTIONS CONCERNING SIZE
  size_type size() const      { return table.num_nonempty() - num_deleted; }
  // Buckets are always a power of 2
  size_type max_size() const  { return (size_type(-1) >> 1U) + 1; }
  bool empty() const          { return size() == 0; }
  size_type bucket_count() const      { return table.size(); }
  size_type max_bucket_count() const  { return max_size(); }

 private:
  // Because of the above, size_type(-1) is never legal; use it for errors
  static const size_type ILLEGAL_BUCKET = size_type(-1);

 private:
  // This is the smallest size a hashtable can be without being too crowded
  // If you like, you can give a min #buckets as well as a min #elts
  size_type min_size(size_type num_elts, size_type min_buckets_wanted) {
    size_type sz = HT_MIN_BUCKETS;
    while ( sz < min_buckets_wanted || num_elts >= sz * HT_OCCUPANCY_FLT )
      sz *= 2;
    return sz;
  }
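
  // Worked example (an added illustration): with the default
  // HT_OCCUPANCY_FLT = 0.8, min_size(100, 0) runs
  //   sz =  32:  100 >= 25.6  -> double
  //   sz =  64:  100 >= 51.2  -> double
  //   sz = 128:  100 <  102.4 -> stop
  // so 100 elements get at least 128 buckets.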

  // Used after a string of deletes
  void maybe_shrink() {
    assert(table.num_nonempty() >= num_deleted);
    assert((bucket_count() & (bucket_count()-1)) == 0); // is a power of two
    assert(bucket_count() >= HT_MIN_BUCKETS);

    if ( (table.num_nonempty()-num_deleted) <= shrink_threshold &&
         bucket_count() > HT_MIN_BUCKETS ) {
      size_type sz = bucket_count() / 2;    // find how much we should shrink
      while ( sz > HT_MIN_BUCKETS &&
              (table.num_nonempty() - num_deleted) <= sz * HT_EMPTY_FLT )
        sz /= 2;                            // stay a power of 2
      sparse_hashtable tmp(MoveDontCopy, *this, sz);
      swap(tmp);                            // now we are tmp
    }
    consider_shrink = false;                // because we just considered it
  }

  // We'll let you resize a hashtable -- though this makes us copy all!
  // When you resize, you say, "make it big enough for this many more elements"
  void resize_delta(size_type delta, size_type min_buckets_wanted = 0) {
    if ( consider_shrink )                   // see if lots of deletes happened
      maybe_shrink();
    if ( bucket_count() > min_buckets_wanted &&
         (table.num_nonempty() + delta) <= enlarge_threshold )
      return;                                // we're ok as we are

    const size_type resize_to = min_size(table.num_nonempty() + delta,
                                         min_buckets_wanted);
    if ( resize_to > bucket_count() ) {      // we don't have enough buckets
      sparse_hashtable tmp(MoveDontCopy, *this, resize_to);
      swap(tmp);                             // now we are tmp
    }
  }

  // Used to actually do the rehashing when we grow/shrink a hashtable
  void copy_from(const sparse_hashtable &ht, size_type min_buckets_wanted = 0) {
    clear();            // clear table, set num_deleted to 0

    // If we need to change the size of our table, do it now
    const size_type resize_to = min_size(ht.size(), min_buckets_wanted);
    if ( resize_to > bucket_count() ) {      // we don't have enough buckets
      table.resize(resize_to);               // sets the number of buckets
      reset_thresholds();
    }

    // We use a normal iterator to get non-deleted buckets from ht
    // We could use insert() here, but since we know there are
    // no duplicates and no deleted items, we can be more efficient
    assert( (bucket_count() & (bucket_count()-1)) == 0);      // a power of two
    for ( const_iterator it = ht.begin(); it != ht.end(); ++it ) {
      size_type num_probes = 0;              // how many times we've probed
      size_type bucknum;
      const size_type bucket_count_minus_one = bucket_count() - 1;
      for (bucknum = hash(get_key(*it)) & bucket_count_minus_one;
           table.test(bucknum);                          // not empty
           bucknum = (bucknum + JUMP_(get_key(*it), num_probes)) & bucket_count_minus_one) {
        ++num_probes;
        assert(num_probes < bucket_count()); // or else the hashtable is full
      }
      table.set(bucknum, *it);               // copies the value to here
    }
  }

  // Implementation is like copy_from, but it destroys the table of the
  // "from" guy by freeing sparsetable memory as we iterate.  This is
  // useful in resizing, since we're throwing away the "from" guy anyway.
  void move_from(MoveDontCopyT mover, sparse_hashtable &ht,
                 size_type min_buckets_wanted = 0) {
    clear();            // clear table, set num_deleted to 0

    // If we need to change the size of our table, do it now
    size_t resize_to;
    if ( mover == MoveDontGrow )
      resize_to = ht.bucket_count();         // keep same size as old ht
    else                                     // MoveDontCopy
      resize_to = min_size(ht.size(), min_buckets_wanted);
    if ( resize_to > bucket_count() ) {      // we don't have enough buckets
      table.resize(resize_to);               // sets the number of buckets
      reset_thresholds();
    }

    // We use a normal iterator to get non-deleted buckets from ht
    // We could use insert() here, but since we know there are
    // no duplicates and no deleted items, we can be more efficient
    assert( (bucket_count() & (bucket_count()-1)) == 0);      // a power of two
    // THIS IS THE MAJOR LINE THAT DIFFERS FROM COPY_FROM():
    for ( destructive_iterator it = ht.destructive_begin();
          it != ht.destructive_end(); ++it ) {
      size_type num_probes = 0;              // how many times we've probed
      size_type bucknum;
      for ( bucknum = hash(get_key(*it)) & (bucket_count()-1);  // h % buck_cnt
            table.test(bucknum);                          // not empty
            bucknum = (bucknum + JUMP_(get_key(*it), num_probes)) & (bucket_count()-1) ) {
        ++num_probes;
        assert(num_probes < bucket_count()); // or else the hashtable is full
      }
      table.set(bucknum, *it);               // copies the value to here
    }
  }


  // Required by the spec for hashed associative container
 public:
  // Though the docs say this should be num_buckets, I think it's much
  // more useful as num_elements.  As a special feature, calling with
  // req_elements==0 will cause us to shrink if we can, saving space.
  void resize(size_type req_elements) {       // resize to this or larger
    if ( consider_shrink || req_elements == 0 )
      maybe_shrink();
    if ( req_elements > table.num_nonempty() )    // we only grow
      resize_delta(req_elements - table.num_nonempty(), 0);
  }


  // CONSTRUCTORS -- as required by the specs, we take a size,
  // but also let you specify a hashfunction, key comparator,
  // and key extractor.  We also define a copy constructor and =.
  // DESTRUCTOR -- the default is fine, surprisingly.
  explicit sparse_hashtable(size_type n = 0,
                            const HashFcn& hf = HashFcn(),
                            const EqualKey& eql = EqualKey(),
                            const ExtractKey& ext = ExtractKey())
    : hash(hf), equals(eql), get_key(ext), num_deleted(0),
      use_deleted(false), delkey(), table(min_size(0, n)) {    // start small
    reset_thresholds();
  }

  // As a convenience for resize(), we allow an optional second argument
  // which lets you make this new hashtable a different size than ht.
  // We also provide a mechanism of saying you want to "move" the ht argument
  // into us instead of copying.
  sparse_hashtable(const sparse_hashtable& ht, size_type min_buckets_wanted = 0)
    : hash(ht.hash), equals(ht.equals), get_key(ht.get_key),
      num_deleted(0), use_deleted(ht.use_deleted), delkey(ht.delkey), table() {
    reset_thresholds();
    copy_from(ht, min_buckets_wanted);   // copy_from() ignores deleted entries
  }
  sparse_hashtable(MoveDontCopyT mover, sparse_hashtable& ht,
                   size_type min_buckets_wanted=0)
    : hash(ht.hash), equals(ht.equals), get_key(ht.get_key),
      num_deleted(0), use_deleted(ht.use_deleted), delkey(ht.delkey), table() {
    reset_thresholds();
    move_from(mover, ht, min_buckets_wanted);  // ignores deleted entries
  }

  sparse_hashtable& operator= (const sparse_hashtable& ht) {
    if (&ht == this)  return *this;        // don't copy onto ourselves
    clear();
    hash = ht.hash;
    equals = ht.equals;
    get_key = ht.get_key;
    use_deleted = ht.use_deleted;
    set_key(&delkey, ht.delkey);
    copy_from(ht);                         // sets num_deleted to 0 too
    return *this;
  }

  // Many STL algorithms use swap instead of copy constructors
  void swap(sparse_hashtable& ht) {
    STL_NAMESPACE::swap(hash, ht.hash);
    STL_NAMESPACE::swap(equals, ht.equals);
    STL_NAMESPACE::swap(get_key, ht.get_key);
    STL_NAMESPACE::swap(num_deleted, ht.num_deleted);
    STL_NAMESPACE::swap(use_deleted, ht.use_deleted);
    { key_type tmp;     // for annoying reasons, swap() doesn't work
      set_key(&tmp, delkey);
      set_key(&delkey, ht.delkey);
      set_key(&ht.delkey, tmp);
    }
    table.swap(ht.table);
    reset_thresholds();
    ht.reset_thresholds();
  }

  // It's always nice to be able to clear a table without deallocating it
  void clear() {
    table.clear();
    reset_thresholds();
    num_deleted = 0;
  }


  // LOOKUP ROUTINES
 private:
  // Returns a pair of positions: 1st where the object is, 2nd where
  // it would go if you wanted to insert it.  1st is ILLEGAL_BUCKET
  // if object is not found; 2nd is ILLEGAL_BUCKET if it is.
  // Note: because of deletions where-to-insert is not trivial: it's the
  // first deleted bucket we see, as long as we don't find the key later
  pair<size_type, size_type> find_position(const key_type &key) const {
    size_type num_probes = 0;              // how many times we've probed
    const size_type bucket_count_minus_one = bucket_count() - 1;
    size_type bucknum = hash(key) & bucket_count_minus_one;
    size_type insert_pos = ILLEGAL_BUCKET; // where we would insert
    SPARSEHASH_STAT_UPDATE(total_lookups += 1);
    while ( 1 ) {                          // probe until something happens
      if ( !table.test(bucknum) ) {        // bucket is empty
        SPARSEHASH_STAT_UPDATE(total_probes += num_probes);
        if ( insert_pos == ILLEGAL_BUCKET )  // found no prior place to insert
          return pair<size_type,size_type>(ILLEGAL_BUCKET, bucknum);
        else
          return pair<size_type,size_type>(ILLEGAL_BUCKET, insert_pos);

      } else if ( test_deleted(bucknum) ) { // keep searching, but mark to insert
        if ( insert_pos == ILLEGAL_BUCKET )
          insert_pos = bucknum;

      } else if ( equals(key, get_key(table.get(bucknum))) ) {
        SPARSEHASH_STAT_UPDATE(total_probes += num_probes);
        return pair<size_type,size_type>(bucknum, ILLEGAL_BUCKET);
      }
      ++num_probes;                        // we're doing another probe
      bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
      assert(num_probes < bucket_count()); // don't probe too many times!
    }
  }
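
  // What the returned pair means, by example (an added note):
  //   pair<size_type,size_type> pos = find_position(k);
  //   pos.first != ILLEGAL_BUCKET  ->  k lives in bucket pos.first
  //   pos.first == ILLEGAL_BUCKET  ->  k is absent, and pos.second is the
  //                                    bucket an insert should use: the
  //                                    first deleted slot seen, or else
  //                                    the empty slot that ended the probe.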

 public:
  iterator find(const key_type& key) {
    if ( size() == 0 ) return end();
    pair<size_type, size_type> pos = find_position(key);
    if ( pos.first == ILLEGAL_BUCKET )     // alas, not there
      return end();
    else
      return iterator(this, table.get_iter(pos.first), table.nonempty_end());
  }

  const_iterator find(const key_type& key) const {
    if ( size() == 0 ) return end();
    pair<size_type, size_type> pos = find_position(key);
    if ( pos.first == ILLEGAL_BUCKET )     // alas, not there
      return end();
    else
      return const_iterator(this,
                            table.get_iter(pos.first), table.nonempty_end());
  }

  // Counts how many elements have key key.  For maps, it's either 0 or 1.
  size_type count(const key_type &key) const {
    pair<size_type, size_type> pos = find_position(key);
    return pos.first == ILLEGAL_BUCKET ? 0 : 1;
  }

  // Likewise, equal_range doesn't really make sense for us.  Oh well.
  pair<iterator,iterator> equal_range(const key_type& key) {
    const iterator pos = find(key);      // either an iterator or end
    return pair<iterator,iterator>(pos, pos);
  }
  pair<const_iterator,const_iterator> equal_range(const key_type& key) const {
    const const_iterator pos = find(key);      // either an iterator or end
    return pair<const_iterator,const_iterator>(pos, pos);
  }


  // INSERTION ROUTINES
 private:
  // If you know *this is big enough to hold obj, use this routine
  pair<iterator, bool> insert_noresize(const value_type& obj) {
    const pair<size_type,size_type> pos = find_position(get_key(obj));
    if ( pos.first != ILLEGAL_BUCKET) {      // object was already there
      return pair<iterator,bool>(iterator(this, table.get_iter(pos.first),
                                          table.nonempty_end()),
                                 false);          // false: we didn't insert
    } else {                                 // pos.second says where to put it
      if ( test_deleted(pos.second) ) {      // just replace if it's been del.
        // The set() below will undelete this object.  We just worry about stats
        assert(num_deleted > 0);
        --num_deleted;                       // used to be, now it isn't
      }
      table.set(pos.second, obj);
      return pair<iterator,bool>(iterator(this, table.get_iter(pos.second),
                                          table.nonempty_end()),
                                 true);           // true: we did insert
    }
  }

 public:
  // This is the normal insert routine, used by the outside world
  pair<iterator, bool> insert(const value_type& obj) {
    resize_delta(1);                      // adding an object, grow if need be
    return insert_noresize(obj);
  }

#ifdef UNDERSTANDS_ITERATOR_TAGS
  // When inserting a lot at a time, we specialize on the type of iterator
  template <class InputIterator>
  void insert(InputIterator f, InputIterator l) {
    // specializes on iterator type
    insert(f, l, typename STL_NAMESPACE::iterator_traits<InputIterator>::iterator_category());
  }

  // Iterator supports operator-, resize before inserting
  template <class ForwardIterator>
  void insert(ForwardIterator f, ForwardIterator l,
              STL_NAMESPACE::forward_iterator_tag) {
    size_type n = STL_NAMESPACE::distance(f, l);   // TODO(csilvers): standard?
    resize_delta(n);
    for ( ; n > 0; --n, ++f)
      insert_noresize(*f);
  }

  // Arbitrary iterator, can't tell how much to resize
  template <class InputIterator>
  void insert(InputIterator f, InputIterator l,
              STL_NAMESPACE::input_iterator_tag) {
    for ( ; f != l; ++f)
      insert(*f);
  }
#else
  template <class InputIterator>
  void insert(InputIterator f, InputIterator l) {
    for ( ; f != l; ++f)
      insert(*f);
  }
#endif
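
  // The iterator-tag dispatch above, in miniature (an added sketch, not
  // part of the original header): the three-argument overloads are chosen
  // at compile time from the iterator's category tag.
  //
  //   template <class It>
  //   void process(It f, It l, STL_NAMESPACE::forward_iterator_tag) {
  //     // multi-pass range: safe to pre-compute distance(f, l) and
  //     // reserve space once, then insert element by element
  //   }
  //   template <class It>
  //   void process(It f, It l, STL_NAMESPACE::input_iterator_tag) {
  //     // single-pass range: just consume it one element at a time
  //   }
  //   template <class It>
  //   void process(It f, It l) {
  //     process(f, l,
  //         typename STL_NAMESPACE::iterator_traits<It>::iterator_category());
  //   }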


  // DELETION ROUTINES
  size_type erase(const key_type& key) {
    const_iterator pos = find(key);   // shrug: shouldn't need to be const
    if ( pos != end() ) {
      assert(!test_deleted(pos));  // or find() shouldn't have returned it
      set_deleted(pos);
      ++num_deleted;
      consider_shrink = true;      // will think about shrink after next insert
      return 1;                    // because we deleted one thing
    } else {
      return 0;                    // because we deleted nothing
    }
  }

  // This is really evil: really it should be iterator, not const_iterator.
  // But...the only reason keys are const is to allow lookup.
  // Since that's a moot issue for deleted keys, we allow const_iterators
  void erase(const_iterator pos) {
    if ( pos == end() ) return;    // sanity check
    if ( set_deleted(pos) ) {      // true if object has been newly deleted
      ++num_deleted;
      consider_shrink = true;      // will think about shrink after next insert
    }
  }

  void erase(const_iterator f, const_iterator l) {
    for ( ; f != l; ++f) {
      if ( set_deleted(f) )        // should always be true
        ++num_deleted;
    }
    consider_shrink = true;        // will think about shrink after next insert
  }


  // COMPARISON
  bool operator==(const sparse_hashtable& ht) const {
    // We really want to check that the hash functions are the same
    // but alas there's no way to do this.  We just hope.
    return ( num_deleted == ht.num_deleted && table == ht.table );
  }
  bool operator!=(const sparse_hashtable& ht) const {
    return !(*this == ht);
  }


  // I/O
  // We support reading and writing hashtables to disk.  NOTE that
  // this only stores the hashtable metadata, not the stuff you've
  // actually put in the hashtable!  Alas, since I don't know how to
  // write a hasher or key_equal, you have to make sure everything
  // but the table is the same.  We compact before writing.
  bool write_metadata(FILE *fp) {
    squash_deleted();           // so we don't have to worry about delkey
    return table.write_metadata(fp);
  }

  bool read_metadata(FILE *fp) {
    num_deleted = 0;            // since we got rid before writing
    bool result = table.read_metadata(fp);
    reset_thresholds();
    return result;
  }

  bool write_nopointer_data(FILE *fp) {
    return table.write_nopointer_data(fp);
  }

  bool read_nopointer_data(FILE *fp) {
    return table.read_nopointer_data(fp);
  }
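
  // A typical save/restore round trip (an added sketch; the raw-byte
  // routines are only valid when value_type contains no pointers):
  //   FILE* fp = fopen("hashtable.bin", "wb");
  //   ht.write_metadata(fp);          // squashes deleted entries, then
  //                                   // writes the bucket structure
  //   ht.write_nopointer_data(fp);    // dumps the values as raw bytes
  //   fclose(fp);
  // To reload, call read_metadata() then read_nopointer_data() on a table
  // constructed with the same hasher and key_equal.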

 private:
  // The actual data
  hasher hash;                      // required by hashed_associative_container
  key_equal equals;
  ExtractKey get_key;
  size_type num_deleted;        // how many occupied buckets are marked deleted
  bool use_deleted;                          // false until delkey has been set
  key_type delkey;                           // which key marks deleted entries
  sparsetable<value_type> table;      // holds num_buckets and num_elements too
  size_type shrink_threshold;                    // table.size() * HT_EMPTY_FLT
  size_type enlarge_threshold;               // table.size() * HT_OCCUPANCY_FLT
  bool consider_shrink;   // true if we should try to shrink before next insert

  void reset_thresholds() {
    enlarge_threshold = static_cast<size_type>(table.size()*HT_OCCUPANCY_FLT);
    shrink_threshold = static_cast<size_type>(table.size()*HT_EMPTY_FLT);
    consider_shrink = false;   // whatever caused us to reset already considered
  }
};

// We need a global swap as well
template <class V, class K, class HF, class ExK, class EqK, class A>
inline void swap(sparse_hashtable<V,K,HF,ExK,EqK,A> &x,
                 sparse_hashtable<V,K,HF,ExK,EqK,A> &y) {
  x.swap(y);
}

#undef JUMP_

template <class V, class K, class HF, class ExK, class EqK, class A>
const typename sparse_hashtable<V,K,HF,ExK,EqK,A>::size_type
  sparse_hashtable<V,K,HF,ExK,EqK,A>::ILLEGAL_BUCKET;

// How full we let the table get before we resize.  Knuth says .8 is
// good -- higher causes us to probe too much, though saves memory
template <class V, class K, class HF, class ExK, class EqK, class A>
const float sparse_hashtable<V,K,HF,ExK,EqK,A>::HT_OCCUPANCY_FLT = 0.8f;

// How empty we let the table get before we resize lower.
// It should be less than OCCUPANCY_FLT / 2 or we thrash resizing
template <class V, class K, class HF, class ExK, class EqK, class A>
const float sparse_hashtable<V,K,HF,ExK,EqK,A>::HT_EMPTY_FLT = 0.4 *
sparse_hashtable<V,K,HF,ExK,EqK,A>::HT_OCCUPANCY_FLT;

_END_GOOGLE_NAMESPACE_

#endif /* _SPARSEHASHTABLE_H_ */