Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

hashtable_unittest.cc @ 2162

Revision 2162, 17.6 KB checked in by mattausch, 17 years ago (diff)
improved hash performance with google hashmap

Line
1	// Copyright (c) 2005, Google Inc.
2	// All rights reserved.
3	//
4	// Redistribution and use in source and binary forms, with or without
5	// modification, are permitted provided that the following conditions are
6	// met:
7	//
8	// * Redistributions of source code must retain the above copyright
9	// notice, this list of conditions and the following disclaimer.
10	// * Redistributions in binary form must reproduce the above
11	// copyright notice, this list of conditions and the following disclaimer
12	// in the documentation and/or other materials provided with the
13	// distribution.
14	// * Neither the name of Google Inc. nor the names of its
15	// contributors may be used to endorse or promote products derived from
16	// this software without specific prior written permission.
17	//
18	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30	// ---
31	// Author: Craig Silverstein
32	//
33	// This tests <google/sparsehash/densehashtable.h>
34	// This tests <google/dense_hash_set>
35	// This tests <google/dense_hash_map>
36	// This tests <google/sparsehash/sparsehashtable.h>
37	// This tests <google/sparse_hash_set>
38	// This tests <google/sparse_hash_map>
39
40	// Since {dense,sparse}hashtable is templatized, it's important that
41	// we test every function in every class in this file -- not just to
42	// see if it works, but even if it compiles.
43
44	#include <google/sparsehash/config.h>
45	#include <stdio.h>
46	#include <sys/stat.h> // for stat()
47	#ifdef HAVE_UNISTD_H
48	#include <unistd.h> // for unlink()
49	#endif
50	#include <string.h>
51	#include <time.h> // for silly random-number-seed generator
52	#include <math.h> // for sqrt()
53	#include <map>
54	#include <iterator> // for insert_iterator
55	#include <iostream>
56	#include <iomanip> // for setprecision()
57	#include <string>
58	#include <google/sparsehash/hash_fun.h>
59	#include <google/dense_hash_map>
60	#include <google/dense_hash_set>
61	#include <google/sparsehash/densehashtable.h>
62	#include <google/sparse_hash_map>
63	#include <google/sparse_hash_set>
64	#include <google/sparsehash/sparsehashtable.h>
65
66	using GOOGLE_NAMESPACE::sparse_hash_map;
67	using GOOGLE_NAMESPACE::dense_hash_map;
68	using GOOGLE_NAMESPACE::sparse_hash_set;
69	using GOOGLE_NAMESPACE::dense_hash_set;
70	using GOOGLE_NAMESPACE::sparse_hashtable;
71	using GOOGLE_NAMESPACE::dense_hashtable;
72	using STL_NAMESPACE::map;
73	using STL_NAMESPACE::pair;
74	using STL_NAMESPACE::string;
75	using STL_NAMESPACE::insert_iterator;
76	using STL_NAMESPACE::allocator;
77	using STL_NAMESPACE::equal_to;
78	using STL_NAMESPACE::ostream;
79
80	#define LOGF STL_NAMESPACE::cout // where we log to; LOGF is a historical name
81
82	#define CHECK(cond) do { \
83	if (!(cond)) { \
84	LOGF << "Test failed: " #cond "\n"; \
85	exit(1); \
86	} \
87	} while (0)
88
89	const char *words[] = {"Baffin\n", // in /usr/dict/words
90	"Boffin\n", // not in
91	"baffin\n", // not in
92	"genial\n", // last word in
93	"Aarhus\n", // first word alphabetically
94	"Zurich\n", // last word alphabetically
95	};
96
97	const char *nwords[] = {"Boffin\n",
98	"baffin\n",
99	};
100
101	// Apparently identity is not stl-standard, so we define our own
102	template<class Value>
103	struct Identity {
104	Value& operator()(Value& v) const { return v; }
105	const Value& operator()(const Value& v) const { return v; }
106	};
107
108	// Let us log the pairs that make up a hash_map
109	template<class P1, class P2>
110	ostream& operator<<(ostream& s, const pair<P1, P2>& p) {
111	s << "pair(" << p.first << ", " << p.second << ")";
112	return s;
113	}
114
115	struct strcmp_fnc {
116	bool operator()(const char* s1, const char* s2) const {
117	return ((s1 == 0 && s2 == 0) \|\|
118	(s1 && s2 && s1 == s2 && strcmp(s1, s2) == 0));
119	}
120	};
121
122
123	template <class SparseTable, class T>
124	static void set_empty_key(SparseTable *sp, T val) {
125	}
126
127	template <class T, class H, class I, class C, class A>
128	static void set_empty_key(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
129	ht->set_empty_key(val);
130	}
131
132	template <class T, class H, class C>
133	static void set_empty_key(dense_hash_set<T,H,C> *ht, T val) {
134	ht->set_empty_key(val);
135	}
136
137	template <class K, class V, class H, class C>
138	static void set_empty_key(dense_hash_map<K,V,H,C> *ht, K val) {
139	ht->set_empty_key(val);
140	}
141
142
143	template <class T, class H, class I, class C, class A>
144	static void insert(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
145	ht->insert(val);
146	}
147
148	template <class T, class H, class C>
149	static void insert(dense_hash_set<T,H,C> *ht, T val) {
150	ht->insert(val);
151	}
152
153	template <class K, class V, class H, class C>
154	static void insert(dense_hash_map<K,V,H,C> *ht, K val) {
155	ht->insert(pair<K,V>(val,V()));
156	}
157
158	template <class T, class H, class I, class C, class A>
159	static void insert(sparse_hashtable<T,T,H,I,C,A> *ht, T val) {
160	ht->insert(val);
161	}
162
163	template <class T, class H, class C>
164	static void insert(sparse_hash_set<T,H,C> *ht, T val) {
165	ht->insert(val);
166	}
167
168	template <class K, class V, class H, class C>
169	static void insert(sparse_hash_map<K,V,H,C> *ht, K val) {
170	ht->insert(pair<K,V>(val,V()));
171	}
172
173	template <class HT, class Iterator>
174	static void insert(HT *ht, Iterator begin, Iterator end) {
175	ht->insert(begin, end);
176	}
177
178	// For hashtable's and hash_set's, the iterator insert works fine (and
179	// is used). But for the hash_map's, the iterator insert expects the
180	// iterators to point to pair's. So by looping over and calling insert
181	// on each element individually, the code below automatically expands
182	// into inserting a pair.
183	template <class K, class V, class H, class C, class Iterator>
184	static void insert(dense_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
185	while (begin != end) {
186	insert(ht, *begin);
187	++begin;
188	}
189	}
190
191	template <class K, class V, class H, class C, class Iterator>
192	static void insert(sparse_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
193	while (begin != end) {
194	insert(ht, *begin);
195	++begin;
196	}
197	}
198
199	// A version of insert that uses the insert_iterator. But insert_iterator
200	// isn't defined for the low level hashtable classes, so we just punt to insert.
201
202	template <class T, class H, class I, class C, class A>
203	static void iterator_insert(dense_hashtable<T,T,H,I,C,A>* ht, T val,
204	insert_iterator<dense_hashtable<T,T,H,I,C,A> >* ) {
205	ht->insert(val);
206	}
207
208	template <class T, class H, class C>
209	static void iterator_insert(dense_hash_set<T,H,C>* , T val,
210	insert_iterator<dense_hash_set<T,H,C> >* ii) {
211	(ii)++ = val;
212	}
213
214	template <class K, class V, class H, class C>
215	static void iterator_insert(dense_hash_map<K,V,H,C>* , K val,
216	insert_iterator<dense_hash_map<K,V,H,C> >* ii) {
217	(ii)++ = pair<K,V>(val,V());
218	}
219
220	template <class T, class H, class I, class C, class A>
221	static void iterator_insert(sparse_hashtable<T,T,H,I,C,A>* ht, T val,
222	insert_iterator<sparse_hashtable<T,T,H,I,C,A> >* ) {
223	ht->insert(val);
224	}
225
226	template <class T, class H, class C>
227	static void iterator_insert(sparse_hash_set<T,H,C>* , T val,
228	insert_iterator<sparse_hash_set<T,H,C> >* ii) {
229	(ii)++ = val;
230	}
231
232	template <class K, class V, class H, class C>
233	static void iterator_insert(sparse_hash_map<K,V,H,C> *, K val,
234	insert_iterator<sparse_hash_map<K,V,H,C> >* ii) {
235	(ii)++ = pair<K,V>(val,V());
236	}
237
238
239	static void write_item(FILE fp, const char val) {
240	fwrite(val, strlen(val), 1, fp); // \n serves to separate
241	}
242
243	// The weird 'const' declarations are desired by the compiler. Yucko.
244	static void write_item(FILE fp, const pair<charconst,int> &val) {
245	fwrite(val.first, strlen(val.first), 1, fp);
246	}
247
248	static char* read_line(FILE* fp, char* line, int linesize) {
249	if ( fgets(line, linesize, fp) == NULL )
250	return NULL;
251	// normalize windows files :-(
252	const int linelen = strlen(line);
253	if ( linelen >= 2 && line[linelen-2] == '\r' && line[linelen-1] == '\n' ) {
254	line[linelen-2] = '\n';
255	line[linelen-1] = '\0';
256	}
257	return line;
258	}
259
260	static void read_item(FILE fp, charconst* val) {
261	char line[1024];
262	read_line(fp, line, sizeof(line));
263	char p = const_cast<char>(val);
264	*p = strdup(line);
265	}
266
267	static void read_item(FILE fp, pair<charconst,int> *val) {
268	char line[1024];
269	read_line(fp, line, sizeof(line));
270	char p = const_cast<char>(&val->first);
271	*p = strdup(line);
272	}
273
274	static void free_item(charconst val) {
275	free(*val);
276	}
277
278	static void free_item(pair<charconst,int> val) {
279	free(val->first);
280	}
281
282
283	// The read_write parameters specifies whether the read/write tests
284	// should be performed. Note that densehashtable::write_metdata is not
285	// implemented, so we only do the read/write tests for the
286	// sparsehashtable varieties.
287	template<class ht, class htint>
288	void test(bool read_write) {
289	htint y(1000);
290	htint z(64);
291	set_empty_key(&y, 0xefefef);
292	set_empty_key(&z, 0xefefef);
293
294	CHECK(y.empty());
295	insert(&y, 1);
296	CHECK(!y.empty());
297	insert(&y, 11);
298	insert(&y, 111);
299	insert(&y, 1111);
300	insert(&y, 11111);
301	insert(&y, 111111);
302	insert(&y, 1111111); // 1M, more or less
303	insert(&y, 11111111);
304	insert(&y, 111111111);
305	insert(&y, 1111111111); // 1B, more or less
306	for ( int i = 0; i < 32; ++i )
307	insert(&z, i);
308	// test the second half of the insert with an insert_iterator
309	insert_iterator<htint> insert_iter(z, z.begin());
310	for ( int i = 32; i < 64; ++i )
311	iterator_insert(&z, i, &insert_iter);
312
313	for ( typename htint::const_iterator it = y.begin(); it != y.end(); ++it )
314	LOGF << "y: " << *it << "\n";
315	z.insert(y.begin(), y.end());
316	swap(y,z);
317	for ( typename htint::iterator it = y.begin(); it != y.end(); ++it )
318	LOGF << "y+z: " << *it << "\n";
319	LOGF << "z has " << z.bucket_count() << " buckets\n";
320	LOGF << "y has " << y.bucket_count() << " buckets\n";
321	LOGF << "z size: " << z.size() << "\n";
322
323	for (int i = 0; i < 64; ++i)
324	CHECK(y.find(i) != y.end());
325
326	CHECK(z.size() == 10);
327	z.set_deleted_key(1010101010); // an unused value
328	z.erase(11111);
329	CHECK(z.size() == 9);
330	insert(&z, 11111); // should retake deleted value
331	CHECK(z.size() == 10);
332	// Do the delete/insert again. Last time we probably resized; this time no
333	z.erase(11111);
334	insert(&z, 11111); // should retake deleted value
335	CHECK(z.size() == 10);
336
337	z.erase(-11111); // shouldn't do anything
338	CHECK(z.size() == 10);
339	z.erase(1);
340	CHECK(z.size() == 9);
341	typename htint::iterator itdel = z.find(1111);
342	z.erase(itdel);
343	CHECK(z.size() == 8);
344	itdel = z.find(2222); // should be end()
345	z.erase(itdel); // shouldn't do anything
346	CHECK(z.size() == 8);
347	for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
348	LOGF << "y: " << *it << "\n";
349	z.set_deleted_key(1010101011); // a different unused value
350	for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
351	LOGF << "y: " << *it << "\n";
352	LOGF << "That's " << z.size() << " elements\n";
353	z.erase(z.begin(), z.end());
354	CHECK(z.empty());
355
356	y.clear();
357	CHECK(y.empty());
358	LOGF << "y has " << y.bucket_count() << " buckets\n";
359
360	ht w;
361	set_empty_key(&w, (char*) NULL);
362	insert(&w, const_cast<char **>(nwords),
363	const_cast<char *>(nwords) + sizeof(nwords) / sizeof(nwords));
364	LOGF << "w has " << w.size() << " items\n";
365	CHECK(w.size() == 2);
366	CHECK(w == w);
367
368	ht x;
369	set_empty_key(&x, (char*) NULL);
370	long dict_size = 1; // for size stats -- can't be 0 'cause of division
371
372	map<string, int> counts;
373	// Hash the dictionary
374	{
375	// automake says 'look for all data files in $srcdir.' OK.
376	string filestr = (string(getenv("srcdir") ? getenv("srcdir") : ".") +
377	"/src/words");
378	const char* file = filestr.c_str();
379	FILE *fp = fopen(file, "rb");
380	if ( fp == NULL ) {
381	LOGF << "Can't open " << file << " skipping dictionary hash...";
382	} else {
383	char line[1024];
384	while ( read_line(fp, line, sizeof(line)) ) {
385	insert(&x, strdup(line));
386	counts[line] = 0;
387	}
388	LOGF << "Read " << x.size() << " words from " << file << "\n";
389	fclose(fp);
390	struct stat buf;
391	stat(file, &buf);
392	dict_size = buf.st_size;
393	LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
394	for ( char word = const_cast<char >(words);
395	word < const_cast<char *>(words) + sizeof(words) / sizeof(words);
396	++word ) {
397	if (x.find(*word) == x.end()) {
398	CHECK(w.find(*word) != w.end());
399	} else {
400	CHECK(w.find(*word) == w.end());
401	}
402	}
403	}
404	}
405	CHECK(counts.size() == x.size());
406
407	// Save the hashtable.
408	if (read_write) {
409	const char* file = "/tmp/#hashtable_unittest_dicthash";
410	FILE *fp = fopen(file, "wb");
411	if ( fp == NULL ) {
412	// maybe we can't write to /tmp/. Try the current directory
413	file = "#hashtable_unittest_dicthash";
414	fp = fopen(file, "wb");
415	}
416	if ( fp == NULL ) {
417	LOGF << "Can't open " << file << " skipping hashtable save...";
418	} else {
419	x.write_metadata(fp); // this only writes meta-information
420	int count = 0;
421	for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
422	write_item(fp, *it);
423	free_item(&(*it));
424	++count;
425	}
426	LOGF << "Wrote " << count << " words to " << file << "\n";
427	fclose(fp);
428	struct stat buf;
429	stat(file, &buf);
430	LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
431	LOGF << STL_NAMESPACE::setprecision(3)
432	<< "Hashtable overhead "
433	<< (buf.st_size - dict_size) * 100.0 / dict_size
434	<< "% ("
435	<< (buf.st_size - dict_size) * 8.0 / count
436	<< " bits/entry)\n";
437	x.clear();
438
439	// Load the hashtable
440	fp = fopen(file, "rb");
441	if ( fp == NULL ) {
442	LOGF << "Can't open " << file << " skipping hashtable reload...";
443	} else {
444	x.read_metadata(fp); // reads metainformation
445	LOGF << "Hashtable size: " << x.size() << "\n";
446	int count = 0;
447	for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
448	read_item(fp, &(*it));
449	++count;
450	}
451	LOGF << "Read " << count << " words from " << file << "\n";
452	fclose(fp);
453	unlink(file);
454	for ( char word = const_cast<char >(words);
455	word < const_cast<char *>(words) + sizeof(words) / sizeof(words);
456	++word ) {
457	if (x.find(*word) == x.end()) {
458	CHECK(w.find(*word) != w.end());
459	} else {
460	CHECK(w.find(*word) == w.end());
461	}
462	}
463	}
464	}
465	}
466	for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
467	free_item(&(*it));
468	}
469	}
470
471	int main(int argc, char **argv) {
472	// First try with the low-level hashtable interface
473	LOGF << "\n\nTEST WITH DENSE_HASHTABLE\n\n";
474	test<dense_hashtable<char , char , HASH_NAMESPACE::hash<char *>,
475	Identity<char >, strcmp_fnc, allocator<char > >,
476	dense_hashtable<int, int, HASH_NAMESPACE::hash<int>,
477	Identity<int>, equal_to<int>, allocator<int> > >(
478	false);
479
480	// Now try with hash_set, which should be equivalent
481	LOGF << "\n\nTEST WITH DENSE_HASH_SET\n\n";
482	test<dense_hash_set<char , HASH_NAMESPACE::hash<char >, strcmp_fnc>,
483	dense_hash_set<int> >(false);
484
485	// Now try with hash_map, which differs only in insert()
486	LOGF << "\n\nTEST WITH DENSE_HASH_MAP\n\n";
487	test<dense_hash_map<char , int, HASH_NAMESPACE::hash<char >, strcmp_fnc>,
488	dense_hash_map<int, int> >(false);
489
490	// First try with the low-level hashtable interface
491	LOGF << "\n\nTEST WITH SPARSE_HASHTABLE\n\n";
492	test<sparse_hashtable<char , char , HASH_NAMESPACE::hash<char *>,
493	Identity<char >, strcmp_fnc, allocator<char > >,
494	sparse_hashtable<int, int, HASH_NAMESPACE::hash<int>,
495	Identity<int>, equal_to<int>, allocator<int> > >(
496	true);
497
498	// Now try with hash_set, which should be equivalent
499	LOGF << "\n\nTEST WITH SPARSE_HASH_SET\n\n";
500	test<sparse_hash_set<char , HASH_NAMESPACE::hash<char >, strcmp_fnc>,
501	sparse_hash_set<int> >(true);
502
503	// Now try with hash_map, which differs only in insert()
504	LOGF << "\n\nTEST WITH SPARSE_HASH_MAP\n\n";
505	test<sparse_hash_map<char , int, HASH_NAMESPACE::hash<char >, strcmp_fnc>,
506	sparse_hash_map<int, int> >(true);
507
508	LOGF << "\nAll tests pass.\n";
509	return 0;
510	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: GTP/trunk/Lib/Vis/Preprocessing/src/sparsehash/src/hashtable_unittest.cc @ 2162

Download in other formats: