Context Navigation

libchash.h @ 2162

Revision 2162, 12.4 KB checked in by mattausch, 18 years ago (diff)
improved hash performance with google hashmap

Rev	Line
[2162]	1	/* Copyright (c) 1998 - 2005, Google Inc.
	2	* All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions are
	6	* met:
	7	*
	8	* * Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* * Redistributions in binary form must reproduce the above
	11	* copyright notice, this list of conditions and the following disclaimer
	12	* in the documentation and/or other materials provided with the
	13	* distribution.
	14	* * Neither the name of Google Inc. nor the names of its
	15	* contributors may be used to endorse or promote products derived from
	16	* this software without specific prior written permission.
	17	*
	18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	29	*
	30	* ---
	31	* Author: Craig Silverstein
	32	*
	33	* This library is intended to be used for in-memory hash tables,
	34	* though it provides rudimentary permanent-storage capabilities.
	35	* It attempts to be fast, portable, and small. The best algorithm
	36	* to fulfill these goals is an internal probing hashing algorithm,
	37	* as in Knuth, _Art of Computer Programming_, vol III. Unlike
	38	* chained (open) hashing, it doesn't require a pointer for every
	39	* item, yet it is still constant time lookup in practice.
	40	*
	41	* Also to save space, we let the contents (both data and key) that
	42	* you insert be a union: if the key/data is small, we store it
	43	* directly in the hashtable, otherwise we store a pointer to it.
	44	* To keep you from having to figure out which, use KEY_PTR and
	45	* PTR_KEY to convert between the arguments to these functions and
	46	* a pointer to the real data. For instance:
	47	* char key[] = "ab", *key2;
	48	* HTItem bck; HashTable ht;
	49	* HashInsert(ht, PTR_KEY(ht, key), 0);
	50	* bck = HashFind(ht, PTR_KEY(ht, "ab"));
	51	* key2 = KEY_PTR(ht, bck->key);
	52	*
	53	* There are a rich set of operations supported:
	54	* AllocateHashTable() -- Allocates a hashtable structure and
	55	* returns it.
	56	* cchKey: if it's a positive number, then each key is a
	57	* fixed-length record of that length. If it's 0,
	58	* the key is assumed to be a \0-terminated string.
	59	* fSaveKey: normally, you are responsible for allocating
	60	* space for the key. If this is 1, we make a
	61	* copy of the key for you.
	62	* ClearHashTable() -- Removes everything from a hashtable
	63	* FreeHashTable() -- Frees memory used by a hashtable
	64	*
	65	* HashFind() -- takes a key (use PTR_KEY) and returns the
	66	* HTItem containing that key, or NULL if the
	67	* key is not in the hashtable.
	68	* HashFindLast() -- returns the item found by last HashFind()
	69	* HashFindOrInsert() -- inserts the key/data pair if the key
	70	* is not already in the hashtable, or
	71	* returns the appropraite HTItem if it is.
	72	* HashFindOrInsertItem() -- takes key/data as an HTItem.
	73	* HashInsert() -- adds a key/data pair to the hashtable. What
	74	* it does if the key is already in the table
	75	* depends on the value of SAMEKEY_OVERWRITE.
	76	* HashInsertItem() -- takes key/data as an HTItem.
	77	* HashDelete() -- removes a key/data pair from the hashtable,
	78	* if it's there. RETURNS 1 if it was there,
	79	* 0 else.
	80	* If you use sparse tables and never delete, the full data
	81	* space is available. Otherwise we steal -2 (maybe -3),
	82	* so you can't have data fields with those values.
	83	* HashDeleteLast() -- deletes the item returned by the last Find().
	84	*
	85	* HashFirstBucket() -- used to iterate over the buckets in a
	86	* hashtable. DON'T INSERT OR DELETE WHILE
	87	* ITERATING! You can't nest iterations.
	88	* HashNextBucket() -- RETURNS NULL at the end of iterating.
	89	*
	90	* HashSetDeltaGoalSize() -- if you're going to insert 1000 items
	91	* at once, call this fn with arg 1000.
	92	* It grows the table more intelligently.
	93	*
	94	* HashSave() -- saves the hashtable to a file. It saves keys ok,
	95	* but it doesn't know how to interpret the data field,
	96	* so if the data field is a pointer to some complex
	97	* structure, you must send a function that takes a
	98	* file pointer and a pointer to the structure, and
	99	* write whatever you want to write. It should return
	100	* the number of bytes written. If the file is NULL,
	101	* it should just return the number of bytes it would
	102	* write, without writing anything.
	103	* If your data field is just an integer, not a
	104	* pointer, just send NULL for the function.
	105	* HashLoad() -- loads a hashtable. It needs a function that takes
	106	* a file and the size of the structure, and expects
	107	* you to read in the structure and return a pointer
	108	* to it. You must do memory allocation, etc. If
	109	* the data is just a number, send NULL.
	110	* HashLoadKeys() -- unlike HashLoad(), doesn't load the data off disk
	111	* until needed. This saves memory, but if you look
	112	* up the same key a lot, it does a disk access each
	113	* time.
	114	* You can't do Insert() or Delete() on hashtables that were loaded
	115	* from disk.
	116	*/
	117
	118	#include <sys/types.h> /* includes definition of "ulong", we hope */
	119	#define ulong u_long
	120
	121	#define MAGIC_KEY "CHsh" /* when we save the file */
	122
	123	#ifndef LOG_WORD_SIZE /* 5 for 32 bit words, 6 for 64 */
	124	#ifdef __alpha /* only way I know of determining */
	125	#define LOG_WORD_SIZE 6 /* log_2(sizeof(ulong)) [in bits] */
	126	#else
	127	#define LOG_WORD_SIZE 5 /* log_2(sizeof(ulong)) [in bits] */
	128	#endif
	129	#endif
	130
	131	/* The following gives a speed/time tradeoff: how many buckets are *
	132	* in each bin. 0 gives 32 buckets/bin, which is a good number. */
	133	#ifndef LOG_BM_WORDS
	134	#define LOG_BM_WORDS 0 /* each group has 2^L_B_W * 32 buckets */
	135	#endif
	136
	137	/* The following are all parameters that affect performance. */
	138	#ifndef JUMP
	139	#define JUMP(key, offset) ( ++(offset) ) /* ( 1 ) for linear hashing */
	140	#endif
	141	#ifndef Table
	142	#define Table(x) Sparse##x /* Dense##x for dense tables */
	143	#endif
	144	#ifndef FAST_DELETE
	145	#define FAST_DELETE 0 /* if it's 1, we never shrink the ht */
	146	#endif
	147	#ifndef SAMEKEY_OVERWRITE
	148	#define SAMEKEY_OVERWRITE 1 /* overwrite item with our key on insert? */
	149	#endif
	150	#ifndef OCCUPANCY_PCT
	151	#define OCCUPANCY_PCT 0.5 /* large PCT means smaller and slower */
	152	#endif
	153	#ifndef MIN_HASH_SIZE
	154	#define MIN_HASH_SIZE 512 /* ht size when first created */
	155	#endif
	156	/* When deleting a bucket, we can't just empty it (future hashes *
	157	* may fail); instead we set the data field to DELETED. Thus you *
	158	* should set DELETED to a data value you never use. Better yet, *
	159	* if you don't need to delete, define INSERT_ONLY. */
	160	#ifndef INSERT_ONLY
	161	#define DELETED -2UL
	162	#define IS_BCK_DELETED(bck) ( (bck) && (bck)->data == DELETED )
	163	#define SET_BCK_DELETED(ht, bck) do { (bck)->data = DELETED; \
	164	FREE_KEY(ht, (bck)->key); } while ( 0 )
	165	#else
	166	#define IS_BCK_DELETED(bck) 0
	167	#define SET_BCK_DELETED(ht, bck) \
	168	do { fprintf(stderr, "Deletion not supported for insert-only hashtable\n");\
	169	exit(2); } while ( 0 )
	170	#endif
	171
	172	/* We need the following only for dense buckets (Dense##x above). *
	173	* If you need to, set this to a value you'll never use for data. */
	174	#define EMPTY -3UL /* steal more of the bck->data space */
	175
	176
	177	/* This is what an item is. Either can be cast to a pointer. */
	178	typedef struct {
	179	ulong data; /* 4 bytes for data: either a pointer or an integer */
	180	ulong key; /* 4 bytes for the key: either a pointer or an int */
	181	} HTItem;
	182
	183	struct Table(Bin); /* defined in chash.c, I hope */
	184	struct Table(Iterator);
	185	typedef struct Table(Bin) Table; /* Expands to SparseBin, etc */
	186	typedef struct Table(Iterator) TableIterator;
	187
	188	/* for STORES_PTR to work ok, cchKey MUST BE DEFINED 1st, cItems 2nd! */
	189	typedef struct HashTable {
	190	ulong cchKey; /* the length of the key, or if it's \0 terminated */
	191	ulong cItems; /* number of items currently in the hashtable */
	192	ulong cDeletedItems; /* # of buckets holding DELETE in the hashtable */
	193	ulong cBuckets; /* size of the table */
	194	Table table; / The actual contents of the hashtable */
	195	int fSaveKeys; /* 1 if we copy keys locally; 2 if keys in one block */
	196	int cDeltaGoalSize; /* # of coming inserts (or deletes, if <0) we expect */
	197	HTItem posLastFind; / position of last Find() command */
	198	TableIterator iter; / used in First/NextBucket */
	199
	200	FILE fpData; / if non-NULL, what item->data points into */
	201	char * (dataRead)(FILE , int); /* how to load data from disk */
	202	HTItem bckData; /* holds data after being loaded from disk */
	203	} HashTable;
	204
	205	/* Small keys are stored and passed directly, but large keys are
	206	* stored and passed as pointers. To make it easier to remember
	207	* what to pass, we provide two functions:
	208	* PTR_KEY: give it a pointer to your data, and it returns
	209	* something appropriate to send to Hash() functions or
	210	* be stored in a data field.
	211	* KEY_PTR: give it something returned by a Hash() routine, and
	212	* it returns a (char *) pointer to the actual data.
	213	*/
	214	#define HashKeySize(ht) ( ((ulong )(ht))[0] ) / this is how we inline */
	215	#define HashSize(ht) ( ((ulong )(ht))[1] ) / ...a la C++ :-) */
	216
	217	#define STORES_PTR(ht) ( HashKeySize(ht) == 0 \|\| \
	218	HashKeySize(ht) > sizeof(ulong) )
	219	#define KEY_PTR(ht, key) ( STORES_PTR(ht) ? (char )(key) : (char )&(key) )
	220	#ifdef DONT_HAVE_TO_WORRY_ABOUT_BUS_ERRORS
	221	#define PTR_KEY(ht, ptr) ( STORES_PTR(ht) ? (ulong)(ptr) : (ulong )(ptr) )
	222	#else
	223	#define PTR_KEY(ht, ptr) ( STORES_PTR(ht) ? (ulong)(ptr) : HTcopy((char *)ptr))
	224	#endif
	225
	226
	227	/* Function prototypes */
	228	unsigned long HTcopy(char pul); / for PTR_KEY, not for users */
	229
	230	struct HashTable *AllocateHashTable(int cchKey, int fSaveKeys);
	231	void ClearHashTable(struct HashTable *ht);
	232	void FreeHashTable(struct HashTable *ht);
	233
	234	HTItem HashFind(struct HashTable ht, ulong key);
	235	HTItem HashFindLast(struct HashTable ht);
	236	HTItem HashFindOrInsert(struct HashTable ht, ulong key, ulong dataInsert);
	237	HTItem HashFindOrInsertItem(struct HashTable ht, HTItem *pItem);
	238
	239	HTItem HashInsert(struct HashTable ht, ulong key, ulong data);
	240	HTItem HashInsertItem(struct HashTable ht, HTItem *pItem);
	241
	242	int HashDelete(struct HashTable *ht, ulong key);
	243	int HashDeleteLast(struct HashTable *ht);
	244
	245	HTItem HashFirstBucket(struct HashTable ht);
	246	HTItem HashNextBucket(struct HashTable ht);
	247
	248	int HashSetDeltaGoalSize(struct HashTable *ht, int delta);
	249
	250	void HashSave(FILE fp, struct HashTable ht, int (write)(FILE , char *));
	251	struct HashTable HashLoad(FILE fp, char * (read)(FILE , int));
	252	struct HashTable HashLoadKeys(FILE fp, char * (read)(FILE , int));

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: GTP/trunk/Lib/Vis/Preprocessing/src/sparsehash/experimental/libchash.h @ 2162

Download in other formats: