source: GTP/trunk/Lib/Vis/Preprocessing/src/sparsehash/experimental/libchash.h @ 2162

Revision 2162, 12.4 KB checked in by mattausch, 18 years ago (diff)

improved hash performance with google hashmap

Line 
1/* Copyright (c) 1998 - 2005, Google Inc.
2 * All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ---
31 * Author: Craig Silverstein
32 *
33 *  This library is intended to be used for in-memory hash tables,
34 *  though it provides rudimentary permanent-storage capabilities.
35 *  It attempts to be fast, portable, and small.  The best algorithm
36 *  to fulfill these goals is an internal probing hashing algorithm,
37 *  as in Knuth, _Art of Computer Programming_, vol III.  Unlike
38 *  chained (open) hashing, it doesn't require a pointer for every
39 *  item, yet it is still constant time lookup in practice.
40 *
41 *  Also to save space, we let the contents (both data and key) that
42 *  you insert be a union: if the key/data is small, we store it
43 *  directly in the hashtable, otherwise we store a pointer to it.
44 *  To keep you from having to figure out which, use KEY_PTR and
45 *  PTR_KEY to convert between the arguments to these functions and
46 *  a pointer to the real data.  For instance:
47 *     char key[] = "ab", *key2;
48 *     HTItem *bck; HashTable *ht;
49 *     HashInsert(ht, PTR_KEY(ht, key), 0);
50 *     bck = HashFind(ht, PTR_KEY(ht, "ab"));
51 *     key2 = KEY_PTR(ht, bck->key);
52 *
53 *  There are a rich set of operations supported:
54 *     AllocateHashTable() -- Allocates a hashtable structure and
55 *                            returns it.
56 *        cchKey: if it's a positive number, then each key is a
57 *                fixed-length record of that length.  If it's 0,
58 *                the key is assumed to be a \0-terminated string.
59 *        fSaveKey: normally, you are responsible for allocating
60 *                  space for the key.  If this is 1, we make a
61 *                  copy of the key for you.
62 *     ClearHashTable() -- Removes everything from a hashtable
63 *     FreeHashTable() -- Frees memory used by a hashtable
64 *
65 *     HashFind() -- takes a key (use PTR_KEY) and returns the
66 *                   HTItem containing that key, or NULL if the
67 *                   key is not in the hashtable.
68 *     HashFindLast() -- returns the item found by last HashFind()
69 *     HashFindOrInsert() -- inserts the key/data pair if the key
70 *                           is not already in the hashtable, or
71 *                           returns the appropraite HTItem if it is.
72 *     HashFindOrInsertItem() -- takes key/data as an HTItem.
73 *     HashInsert() -- adds a key/data pair to the hashtable.  What
74 *                     it does if the key is already in the table
75 *                     depends on the value of SAMEKEY_OVERWRITE.
76 *     HashInsertItem() -- takes key/data as an HTItem.
77 *     HashDelete() -- removes a key/data pair from the hashtable,
78 *                     if it's there.  RETURNS 1 if it was there,
79 *                     0 else.
80 *        If you use sparse tables and never delete, the full data
81 *        space is available.  Otherwise we steal -2 (maybe -3),
82 *        so you can't have data fields with those values.
83 *     HashDeleteLast() -- deletes the item returned by the last Find().
84 *
85 *     HashFirstBucket() -- used to iterate over the buckets in a
86 *                          hashtable.  DON'T INSERT OR DELETE WHILE
87 *                          ITERATING!  You can't nest iterations.
88 *     HashNextBucket() -- RETURNS NULL at the end of iterating.
89 *
90 *     HashSetDeltaGoalSize() -- if you're going to insert 1000 items
91 *                               at once, call this fn with arg 1000.
92 *                               It grows the table more intelligently.
93 *
94 *     HashSave() -- saves the hashtable to a file.  It saves keys ok,
95 *                   but it doesn't know how to interpret the data field,
96 *                   so if the data field is a pointer to some complex
97 *                   structure, you must send a function that takes a
98 *                   file pointer and a pointer to the structure, and
99 *                   write whatever you want to write.  It should return
100 *                   the number of bytes written.  If the file is NULL,
101 *                   it should just return the number of bytes it would
102 *                   write, without writing anything.
103 *                      If your data field is just an integer, not a
104 *                   pointer, just send NULL for the function.
105 *     HashLoad() -- loads a hashtable.  It needs a function that takes
106 *                   a file and the size of the structure, and expects
107 *                   you to read in the structure and return a pointer
108 *                   to it.  You must do memory allocation, etc.  If
109 *                   the data is just a number, send NULL.
110 *     HashLoadKeys() -- unlike HashLoad(), doesn't load the data off disk
111 *                       until needed.  This saves memory, but if you look
112 *                       up the same key a lot, it does a disk access each
113 *                       time.
114 *        You can't do Insert() or Delete() on hashtables that were loaded
115 *        from disk.
116 */
117
118#include <sys/types.h>         /* includes definition of "ulong", we hope */
119#define ulong u_long
120
121#define MAGIC_KEY             "CHsh"   /* when we save the file */
122
123#ifndef LOG_WORD_SIZE                  /* 5 for 32 bit words, 6 for 64 */
124#ifdef __alpha                         /* only way I know of determining */
125#define LOG_WORD_SIZE          6       /* log_2(sizeof(ulong)) [in bits] */
126#else
127#define LOG_WORD_SIZE          5       /* log_2(sizeof(ulong)) [in bits] */
128#endif
129#endif
130
131   /* The following gives a speed/time tradeoff: how many buckets are  *
132    * in each bin.  0 gives 32 buckets/bin, which is a good number.    */
133#ifndef LOG_BM_WORDS
134#define LOG_BM_WORDS        0      /* each group has 2^L_B_W * 32 buckets */
135#endif
136
137   /* The following are all parameters that affect performance. */
138#ifndef JUMP
139#define JUMP(key, offset)   ( ++(offset) )  /* ( 1 ) for linear hashing */
140#endif
141#ifndef Table
142#define Table(x)            Sparse##x       /* Dense##x for dense tables */
143#endif
144#ifndef FAST_DELETE
145#define FAST_DELETE         0      /* if it's 1, we never shrink the ht */
146#endif
147#ifndef SAMEKEY_OVERWRITE
148#define SAMEKEY_OVERWRITE   1      /* overwrite item with our key on insert? */
149#endif
150#ifndef OCCUPANCY_PCT
151#define OCCUPANCY_PCT       0.5    /* large PCT means smaller and slower */
152#endif
153#ifndef MIN_HASH_SIZE
154#define MIN_HASH_SIZE       512    /* ht size when first created */
155#endif
156   /* When deleting a bucket, we can't just empty it (future hashes  *
157    * may fail); instead we set the data field to DELETED.  Thus you *
158    * should set DELETED to a data value you never use.  Better yet, *
159    * if you don't need to delete, define INSERT_ONLY.               */
160#ifndef INSERT_ONLY
161#define DELETED                   -2UL
162#define IS_BCK_DELETED(bck)       ( (bck) && (bck)->data == DELETED )
163#define SET_BCK_DELETED(ht, bck)  do { (bck)->data = DELETED;                \
164                                       FREE_KEY(ht, (bck)->key); } while ( 0 )
165#else
166#define IS_BCK_DELETED(bck)       0
167#define SET_BCK_DELETED(ht, bck)  \
168   do { fprintf(stderr, "Deletion not supported for insert-only hashtable\n");\
169        exit(2); } while ( 0 )
170#endif
171
172   /* We need the following only for dense buckets (Dense##x above).  *
173    * If you need to, set this to a value you'll never use for data.  */
174#define EMPTY -3UL                /* steal more of the bck->data space */
175
176
177   /* This is what an item is.  Either can be cast to a pointer. */
178typedef struct {
179   ulong data;        /* 4 bytes for data: either a pointer or an integer */
180   ulong key;         /* 4 bytes for the key: either a pointer or an int */
181} HTItem;
182
183struct Table(Bin);                            /* defined in chash.c, I hope */
184struct Table(Iterator);
185typedef struct Table(Bin)       Table;        /* Expands to SparseBin, etc */
186typedef struct Table(Iterator)  TableIterator;
187
188   /* for STORES_PTR to work ok, cchKey MUST BE DEFINED 1st, cItems 2nd! */
189typedef struct HashTable {
190   ulong cchKey;        /* the length of the key, or if it's \0 terminated */
191   ulong cItems;        /* number of items currently in the hashtable */
192   ulong cDeletedItems; /* # of buckets holding DELETE in the hashtable */
193   ulong cBuckets;      /* size of the table */
194   Table *table;        /* The actual contents of the hashtable */
195   int fSaveKeys;       /* 1 if we copy keys locally; 2 if keys in one block */
196   int cDeltaGoalSize;  /* # of coming inserts (or deletes, if <0) we expect */
197   HTItem *posLastFind; /* position of last Find() command */
198   TableIterator *iter; /* used in First/NextBucket */
199
200   FILE *fpData;        /* if non-NULL, what item->data points into */
201   char * (*dataRead)(FILE *, int);   /* how to load data from disk */
202   HTItem bckData;      /* holds data after being loaded from disk */
203} HashTable;
204
205   /* Small keys are stored and passed directly, but large keys are
206    * stored and passed as pointers.  To make it easier to remember
207    * what to pass, we provide two functions:
208    *   PTR_KEY: give it a pointer to your data, and it returns
209    *            something appropriate to send to Hash() functions or
210    *            be stored in a data field.
211    *   KEY_PTR: give it something returned by a Hash() routine, and
212    *            it returns a (char *) pointer to the actual data.
213    */
214#define HashKeySize(ht)   ( ((ulong *)(ht))[0] )  /* this is how we inline */
215#define HashSize(ht)      ( ((ulong *)(ht))[1] )  /* ...a la C++ :-) */
216
217#define STORES_PTR(ht)    ( HashKeySize(ht) == 0 || \
218                            HashKeySize(ht) > sizeof(ulong) )
219#define KEY_PTR(ht, key)  ( STORES_PTR(ht) ? (char *)(key) : (char *)&(key) )
220#ifdef DONT_HAVE_TO_WORRY_ABOUT_BUS_ERRORS
221#define PTR_KEY(ht, ptr)  ( STORES_PTR(ht) ? (ulong)(ptr) : *(ulong *)(ptr) )
222#else
223#define PTR_KEY(ht, ptr)  ( STORES_PTR(ht) ? (ulong)(ptr) : HTcopy((char *)ptr))
224#endif
225
226
227   /* Function prototypes */
228unsigned long HTcopy(char *pul);         /* for PTR_KEY, not for users */
229
230struct HashTable *AllocateHashTable(int cchKey, int fSaveKeys);
231void ClearHashTable(struct HashTable *ht);
232void FreeHashTable(struct HashTable *ht);
233
234HTItem *HashFind(struct HashTable *ht, ulong key);
235HTItem *HashFindLast(struct HashTable *ht);
236HTItem *HashFindOrInsert(struct HashTable *ht, ulong key, ulong dataInsert);
237HTItem *HashFindOrInsertItem(struct HashTable *ht, HTItem *pItem);
238
239HTItem *HashInsert(struct HashTable *ht, ulong key, ulong data);
240HTItem *HashInsertItem(struct HashTable *ht, HTItem *pItem);
241
242int HashDelete(struct HashTable *ht, ulong key);
243int HashDeleteLast(struct HashTable *ht);
244
245HTItem *HashFirstBucket(struct HashTable *ht);
246HTItem *HashNextBucket(struct HashTable *ht);
247
248int HashSetDeltaGoalSize(struct HashTable *ht, int delta);
249
250void HashSave(FILE *fp, struct HashTable *ht, int (*write)(FILE *, char *));
251struct HashTable *HashLoad(FILE *fp, char * (*read)(FILE *, int));
252struct HashTable *HashLoadKeys(FILE *fp, char * (*read)(FILE *, int));
Note: See TracBrowser for help on using the repository browser.