- Timestamp:
- 02/21/10 13:06:04 (3 years ago)
- Location:
- lang/ruby/ruby-bayon/trunk
- Files:
-
- 17 modified
-
bayon/byvector.cc (modified) (5 diffs)
-
bayon/byvector.h (modified) (3 diffs)
-
bayon/classifier.cc (modified) (3 diffs)
-
bayon/classifier.h (modified) (4 diffs)
-
bayon/cluster.cc (modified) (9 diffs)
-
bayon/cluster.h (modified) (9 diffs)
-
bayon/util.cc (modified) (2 diffs)
-
bayon/util.h (modified) (6 diffs)
-
lib/bayon.rb (modified) (1 diff)
-
lib1.9/i386-mswin32/bayonext.so (modified) (previous)
-
sparsehash/google/dense_hash_map (modified) (4 diffs)
-
sparsehash/google/dense_hash_set (modified) (4 diffs)
-
sparsehash/google/sparse_hash_map (modified) (4 diffs)
-
sparsehash/google/sparse_hash_set (modified) (4 diffs)
-
sparsehash/google/sparsehash/densehashtable.h (modified) (18 diffs)
-
sparsehash/google/sparsehash/sparsehashtable.h (modified) (20 diffs)
-
sparsehash/google/sparsetable (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
lang/ruby/ruby-bayon/trunk/bayon/byvector.cc
r35907 r36827 1 // 1 // 2 2 // Utilities for vector operation 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 147 147 other = &vec1; 148 148 } 149 149 150 150 double prod = 0; 151 151 while (it != end) { … … 169 169 double prod = Vector::inner_product(vec1, vec2); 170 170 result = prod / (norm1 * norm2); 171 return std::isnan(result) ? 0.0 : result;171 return isnan(result) ? 0.0 : result; 172 172 } 173 173 } … … 178 178 double norm2 = vec2.norm(); 179 179 double prod = Vector::inner_product(vec1, vec2); 180 double denom = norm1 *norm2 - prod;180 double denom = norm1 + norm2 - prod; 181 181 double result = 0.0; 182 182 if (!denom) { … … 184 184 } else { 185 185 result = prod / denom; 186 return std::isnan(result) ? 0.0 : result;186 return isnan(result) ? 0.0 : result; 187 187 } 188 188 } -
lang/ruby/ruby-bayon/trunk/bayon/byvector.h
r35907 r36827 2 2 // Utility class for vector operation 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 232 232 /** 233 233 * Add other vector 234 * 234 * 235 235 * @param vec input vector 236 236 * @return void … … 240 240 /** 241 241 * Delete other vector 242 * 242 * 243 243 * @param vec input vector 244 244 * @return void -
lang/ruby/ruby-bayon/trunk/bayon/classifier.cc
r35907 r36827 2 2 // Classifier 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 16 16 // along with this program; if not, write to the Free Software 17 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 18 // 19 19 20 20 #include "classifier.h" … … 81 81 /* Get list of id and points of similar vectors */ 82 82 void Classifier::similar_vectors( 83 size_t max, const Vector &vec, 83 size_t max, const Vector &vec, 84 84 std::vector<std::pair<VectorId, double> > &items) const { 85 85 -
lang/ruby/ruby-bayon/trunk/bayon/classifier.h
r35907 r36827 2 2 // Classifier 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 16 16 // along with this program; if not, write to the Free Software 17 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 18 // 19 19 20 20 #ifndef BAYON_CLASSIFIER_H … … 85 85 */ 86 86 Classifier() { 87 init_hash_map(VECID_EMPTY_KEY, vectors_); 88 init_hash_map(VECID_EMPTY_KEY, inverted_index_); 87 init_hash_map(VECID_EMPTY_KEY, vectors_); 88 init_hash_map(VECID_EMPTY_KEY, inverted_index_); 89 89 } 90 90 … … 95 95 for (InvertedIndex::iterator it = inverted_index_.begin(); 96 96 it != inverted_index_.end(); ++it) { 97 if (it->second) delete it->second; 97 if (it->second) delete it->second; 98 98 } 99 99 } -
lang/ruby/ruby-bayon/trunk/bayon/cluster.cc
r35908 r36827 2 2 // Clustering API 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 21 21 22 22 namespace bayon { 23 23 24 24 /* Get sorted documents in clusters */ 25 25 void Cluster::sorted_documents( … … 58 58 if (siz < ndocs) ndocs = siz; 59 59 size_t index, count = 0; 60 60 61 61 index = myrand(&seed_) % siz; // initial center 62 62 docs.push_back(documents_[index]); … … 140 140 size_t Analyzer::repeated_bisection() { 141 141 Cluster *cluster = new Cluster(); 142 cluster->set_seed(seed_);142 cluster->set_seed(seed_); 143 143 for (size_t i = 0; i < documents_.size(); i++) { 144 144 cluster->add_document(documents_[i]); … … 175 175 sectioned[i]->set_sectioned_gain(); 176 176 if (sectioned[i]->sectioned_gain() < limit_eval_) { 177 sectioned[i]->composite_vector()->clear(); 177 for (size_t j = 0; j < sectioned[i]->sectioned_clusters().size(); j++) { 178 sectioned[i]->sectioned_clusters()[j]->clear(); 179 } 178 180 } 179 181 sectioned[i]->composite_vector()->clear(); … … 197 199 double *norms = static_cast<double *>(_alloca(sizeof(double) * clusters.size())); 198 200 #endif 201 199 202 for (size_t i = 0; i < clusters.size(); i++) { 200 203 norms[i] = clusters[i]->composite_vector()->norm(); … … 221 224 double value_base = refined_vector_value( 222 225 *clusters[cluster_id]->composite_vector(), *doc->feature(), -1); 223 double norm_base_moved = sqrt(pow(norms[cluster_id], 2) + value_base); 226 double norm_base_moved = pow(norms[cluster_id], 2) + value_base; 227 norm_base_moved = norm_base_moved > 0 ? sqrt(norm_base_moved) : 0.0; 224 228 225 229 double eval_max = -1.0; … … 230 234 double value_target = refined_vector_value( 231 235 *clusters[j]->composite_vector(), *doc->feature(), 1); 232 double norm_target_moved = sqrt(pow(norms[j], 2) + value_target); 236 double norm_target_moved = pow(norms[j], 2) + value_target; 237 norm_target_moved = norm_target_moved > 0 ? 238 sqrt(norm_target_moved) : 0.0; 233 239 double eval_moved = norm_base_moved + norm_target_moved 234 - (norms[cluster_id] + norms[j]);240 - norms[cluster_id] - norms[j]; 235 241 if (eval_max < eval_moved) { 236 242 eval_max = eval_moved; … … 285 291 count_df(df); 286 292 size_t ndocs = documents_.size(); 287 for (size_t i = 0; i < documents_.size(); i++) { 288 VecHashMap *hmap = documents_[i]->feature()->hash_map(); 289 for (VecHashMap::iterator it = hmap->begin(); it != hmap->end(); ++it) { 290 (*hmap)[it->first] = it->second * log((double)ndocs / (df[it->first] + 1)); 291 } 293 for (size_t i = 0; i < ndocs; i++) { 294 documents_[i]->idf(df, ndocs); 292 295 } 293 296 } -
lang/ruby/ruby-bayon/trunk/bayon/cluster.h
r35907 r36827 2 2 // Clustering API 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 16 16 // along with this program; if not, write to the Free Software 17 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 18 // 19 19 20 20 #ifndef BAYON_CLUSTER_H … … 43 43 44 44 /******************************************************************** 45 * Typedef45 * Constants 46 46 *******************************************************************/ 47 47 const DocumentId DOC_EMPTY_KEY = -1; … … 143 143 } 144 144 145 void idf(const HashMap<VecKey, size_t>::type &df, size_t ndocs) { 146 VecHashMap *hmap = feature()->hash_map(); 147 HashMap<VecKey, size_t>::type::const_iterator dit; 148 for (VecHashMap::iterator it = hmap->begin(); it != hmap->end(); ++it) { 149 dit = df.find(it->first); 150 size_t denom = (dit != df.end()) ? dit->second : 1; 151 (*hmap)[it->first] = 152 it->second * log(static_cast<double>(ndocs) / denom); 153 } 154 } 145 155 }; 146 156 … … 185 195 */ 186 196 unsigned int seed_; 197 198 /** 199 * add the vectors of all documents to composite vector 200 */ 201 void set_composite_vector() { 202 composite_.clear(); 203 for (size_t i = 0; i < documents_.size(); i++) { 204 composite_.add_vector(*documents_[i]->feature()); 205 } 206 } 187 207 188 208 public: … … 241 261 Vector *centroid_vector() { 242 262 if (documents_.size() > 0 && composite_.size() == 0) { 243 for (size_t i = 0; i < documents_.size(); i++) { 244 composite_.add_vector(*documents_[i]->feature()); 245 } 246 } 247 composite_.copy(centroid_); 263 set_composite_vector(); 264 } 265 composite_vector()->copy(centroid_); 248 266 centroid_.normalize(); 249 267 return ¢roid_; … … 256 274 */ 257 275 Vector *composite_vector() { 258 return &composite_;259 }260 261 /**262 * Get composite Vector of the cluster263 *264 * @return const Vector * composite vector265 */266 const Vector *composite_vector() const {267 276 return &composite_; 268 277 } … … 301 310 302 311 /** 312 * Remove a document 313 * 314 * @param doc pointer of document object 315 * @return void 316 */ 317 void remove_document(const Document *doc) { 318 for (size_t i = 0; i < documents_.size(); i++) { 319 if (documents_[i]->id() == doc->id()) { 320 remove_document(i); 321 return; 322 } 323 } 324 } 325 326 /** 303 327 * Get sorted documents in clusters by similarity 304 328 * between documents and center (desc order) … … 318 342 std::vector<Document *> docs; 319 343 for (size_t i = 0; i < documents_.size(); i++) { 320 if ( !removed_[documents_[i]->id()]) {344 if (removed_.find(documents_[i]->id()) == removed_.end()) { 321 345 docs.push_back(documents_[i]); 322 346 } -
lang/ruby/ruby-bayon/trunk/bayon/util.cc
r35907 r36827 2 2 // Utility functions 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 16 16 // along with this program; if not, write to the Free Software 17 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 18 // 19 19 20 20 #include "util.h" -
lang/ruby/ruby-bayon/trunk/bayon/util.h
r35907 r36827 2 2 // Utility functions 3 3 // 4 // Copyright(C) 2009 Mizuki Fujisawa < mfujisa@gmail.com>4 // Copyright(C) 2009 Mizuki Fujisawa <fujisawa@bayon.cc> 5 5 // 6 6 // This program is free software; you can redistribute it and/or modify … … 16 16 // along with this program; if not, write to the Free Software 17 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 18 // 19 19 20 20 #ifndef BAYON_UTIL_H … … 25 25 #endif 26 26 27 #include <cmath> 27 28 #include <cstdlib> 28 29 #include <iostream> … … 46 47 #endif 47 48 48 /* isnan for win32 */ 49 /* isnan */ 50 #ifndef isnan 49 51 #ifdef _WIN32 50 52 #include <cfloat> 51 53 #define isnan(x) _isnan(x) 54 #else 55 #define isnan(x) (x != x) 56 #endif 52 57 #endif 53 58 54 59 55 60 /* hash function of string key for __gnu_cxx::hash_map */ 56 #if !defined(HAVE_GOOGLE_DENSE_HASH_MAP) && defined(HAVE_EXT_HASH_MAP)61 #if (defined(_WIN32) || !defined(HAVE_GOOGLE_DENSE_HASH_MAP)) && defined(HAVE_EXT_HASH_MAP) 57 62 namespace __gnu_cxx { 58 63 template<> struct hash<std::string> { 59 64 size_t operator() (const std::string &x) const { 60 65 return hash<const char *>()(x.c_str()); 61 } 66 } 62 67 }; 63 68 } … … 110 115 void init_hash_map(const KeyType &empty_key, HashType &hmap) { 111 116 #ifdef HAVE_GOOGLE_DENSE_HASH_MAP 112 hmap.max_load_factor(static_cast<float>(0.9));117 hmap.max_load_factor(static_cast<float>(0.9)); 113 118 hmap.set_empty_key(empty_key); 114 119 #endif … … 175 180 void set_seed(unsigned int seed) { seed_ = seed; } 176 181 unsigned int operator()(unsigned int max) { 177 double ratio = static_cast<double>(myrand(&seed_))178 / static_cast<double>(RAND_MAX);182 double ratio = static_cast<double>(myrand(&seed_)) 183 / static_cast<double>(RAND_MAX); 179 184 return static_cast<unsigned int>(ratio * max); 180 185 } -
lang/ruby/ruby-bayon/trunk/lib/bayon.rb
r34106 r36827 46 46 end 47 47 48 def delete_document(label) 49 @documents.delete_if {|l, f| l == label } 50 end 51 52 def [](label) 53 label, features = @documents.assoc(label) 54 return features 55 end 56 57 def labels 58 @documents.map {|l, f| l } 59 end 60 61 def clear 62 @documents.clear 63 end 64 48 65 def do_clustering(method = Analyzer::REPEATED_BISECTION) 66 return [] if @documents.empty? 67 49 68 analyzer = Analyzer.new 50 69 analyzer.set_cluster_size_limit(@cluster_size_limit) if @cluster_size_limit -
lang/ruby/ruby-bayon/trunk/sparsehash/google/dense_hash_map
r33943 r36827 46 46 // 47 47 // In other respects, we adhere mostly to the STL semantics for 48 // hash-map. One important exception is that insert() invalidates 49 // iterators entirely. On the plus side, though, erase() doesn't 50 // invalidate iterators at all, or even change the ordering of elements. 48 // hash-map. One important exception is that insert() may invalidate 49 // iterators entirely -- STL semantics are that insert() may reorder 50 // iterators, but they all still refer to something valid in the 51 // hashtable. Not so for us. Likewise, insert() may invalidate 52 // pointers into the hashtable. (Whether insert invalidates iterators 53 // and pointers depends on whether it results in a hashtable resize). 54 // On the plus side, delete() doesn't invalidate iterators or pointers 55 // at all, or even change the ordering of elements. 51 56 // 52 57 // Here are a few "power user" tips: … … 68 73 // the hash table will never shrink. 69 74 // 70 // Guide to what kind of hash_map to use:71 // (1) dense_hash_map: fastest, uses the most memory 75 // Roughly speaking: 76 // (1) dense_hash_map: fastest, uses the most memory unless entries are small 72 77 // (2) sparse_hash_map: slowest, uses the least memory 73 // (3) hash_map (STL): in the middle 78 // (3) hash_map / unordered_map (STL): in the middle 79 // 74 80 // Typically I use sparse_hash_map when I care about space and/or when 75 81 // I need to save the hashtable on disk. I use hash_map otherwise. I … … 77 83 // small sets with lots of lookups. 78 84 // 79 // - dense_hash_map has, typically, a factor of 2memory overhead (if your80 // data takes up X bytes, the hash_map uses X more bytes in overhead).81 // - sparse_hash_map has about 2bits overhead per entry.85 // - dense_hash_map has, typically, about 78% memory overhead (if your 86 // data takes up X bytes, the hash_map uses .78X more bytes in overhead). 87 // - sparse_hash_map has about 4 bits overhead per entry. 82 88 // - sparse_hash_map can be 3-7 times slower than the others for lookup and, 83 89 // especially, inserts. See time_hash_map.cc for details. … … 276 282 rep.set_empty_key(value_type(key, data_type())); // rep wants a value 277 283 } 278 void set_deleted_key(const key_type& key) { 279 rep.set_deleted_key(key); 280 } 284 key_type empty_key() const { 285 return rep.empty_key().first; // rep returns a value 286 } 287 288 void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } 281 289 void clear_deleted_key() { rep.clear_deleted_key(); } 290 key_type deleted_key() const { return rep.deleted_key(); } 282 291 283 292 // These are standard -
lang/ruby/ruby-bayon/trunk/sparsehash/google/dense_hash_set
r33943 r36827 50 50 // 51 51 // In other respects, we adhere mostly to the STL semantics for 52 // hash-set. One important exception is that insert() invalidates 53 // iterators entirely. On the plus side, though, erase() doesn't 54 // invalidate iterators at all, or even change the ordering of elements. 52 // hash-map. One important exception is that insert() may invalidate 53 // iterators entirely -- STL semantics are that insert() may reorder 54 // iterators, but they all still refer to something valid in the 55 // hashtable. Not so for us. Likewise, insert() may invalidate 56 // pointers into the hashtable. (Whether insert invalidates iterators 57 // and pointers depends on whether it results in a hashtable resize). 58 // On the plus side, delete() doesn't invalidate iterators or pointers 59 // at all, or even change the ordering of elements. 55 60 // 56 61 // Here are a few "power user" tips: … … 72 77 // the hash table will never shrink. 73 78 // 74 // Guide to what kind of hash_set to use:75 // (1) dense_hash_set: fastest, uses the most memory 79 // Roughly speaking: 80 // (1) dense_hash_set: fastest, uses the most memory unless entries are small 76 81 // (2) sparse_hash_set: slowest, uses the least memory 77 // (3) hash_set (STL): in the middle 82 // (3) hash_set / unordered_set (STL): in the middle 83 // 78 84 // Typically I use sparse_hash_set when I care about space and/or when 79 85 // I need to save the hashtable on disk. I use hash_set otherwise. I … … 81 87 // small sets with lots of lookups. 82 88 // 83 // - dense_hash_set has, typically, a factor of 2memory overhead (if your84 // data takes up X bytes, the hash_set uses X more bytes in overhead).85 // - sparse_hash_set has about 2bits overhead per entry.86 // - sparse_hash_ mapcan be 3-7 times slower than the others for lookup and,89 // - dense_hash_set has, typically, about 78% memory overhead (if your 90 // data takes up X bytes, the hash_set uses .78X more bytes in overhead). 91 // - sparse_hash_set has about 4 bits overhead per entry. 92 // - sparse_hash_set can be 3-7 times slower than the others for lookup and, 87 93 // especially, inserts. See time_hash_map.cc for details. 88 94 // … … 256 262 // deleted key as time goes on, or get rid of it entirely to be insert-only. 257 263 void set_empty_key(const key_type& key) { rep.set_empty_key(key); } 264 key_type empty_key() const { return rep.empty_key(); } 265 258 266 void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } 259 267 void clear_deleted_key() { rep.clear_deleted_key(); } 268 key_type deleted_key() const { return rep.deleted_key(); } 260 269 261 270 // These are standard -
lang/ruby/ruby-bayon/trunk/sparsehash/google/sparse_hash_map
r33943 r36827 38 38 // 39 39 // We adhere mostly to the STL semantics for hash-map. One important 40 // exception is that insert() invalidates iterators entirely. On the 41 // plus side, though, delete() doesn't invalidate iterators at all, or 42 // even change the ordering of elements. 40 // exception is that insert() may invalidate iterators entirely -- STL 41 // semantics are that insert() may reorder iterators, but they all 42 // still refer to something valid in the hashtable. Not so for us. 43 // Likewise, insert() may invalidate pointers into the hashtable. 44 // (Whether insert invalidates iterators and pointers depends on 45 // whether it results in a hashtable resize). On the plus side, 46 // delete() doesn't invalidate iterators or pointers at all, or even 47 // change the ordering of elements. 43 48 // 44 49 // Here are a few "power user" tips: … … 59 64 // the hash table will never shrink. 60 65 // 61 // Guide to what kind of hash_map to use:62 // (1) dense_hash_map: fastest, uses the most memory 66 // Roughly speaking: 67 // (1) dense_hash_map: fastest, uses the most memory unless entries are small 63 68 // (2) sparse_hash_map: slowest, uses the least memory 64 69 // (3) hash_map / unordered_map (STL): in the middle 70 // 65 71 // Typically I use sparse_hash_map when I care about space and/or when 66 72 // I need to save the hashtable on disk. I use hash_map otherwise. I … … 68 74 // small maps with lots of lookups. 69 75 // 70 // - dense_hash_map has, typically, a factor of 2memory overhead (if your71 // data takes up X bytes, the hash_map uses X more bytes in overhead).72 // - sparse_hash_map has about 2bits overhead per entry.76 // - dense_hash_map has, typically, about 78% memory overhead (if your 77 // data takes up X bytes, the hash_map uses .78X more bytes in overhead). 78 // - sparse_hash_map has about 4 bits overhead per entry. 73 79 // - sparse_hash_map can be 3-7 times slower than the others for lookup and, 74 80 // especially, inserts. See time_hash_map.cc for details. … … 264 270 } 265 271 void clear_deleted_key() { rep.clear_deleted_key(); } 272 key_type deleted_key() const { return rep.deleted_key(); } 266 273 267 274 // These are standard -
lang/ruby/ruby-bayon/trunk/sparsehash/google/sparse_hash_set
r33943 r36827 41 41 // change the key, and for sets there is no value). 42 42 // 43 // We adhere mostly to the STL semantics for hash-set. One important 44 // exception is that insert() invalidates iterators entirely. On the 45 // plus side, though, delete() doesn't invalidate iterators at all, or 46 // even change the ordering of elements. 43 // We adhere mostly to the STL semantics for hash-map. One important 44 // exception is that insert() may invalidate iterators entirely -- STL 45 // semantics are that insert() may reorder iterators, but they all 46 // still refer to something valid in the hashtable. Not so for us. 47 // Likewise, insert() may invalidate pointers into the hashtable. 48 // (Whether insert invalidates iterators and pointers depends on 49 // whether it results in a hashtable resize). On the plus side, 50 // delete() doesn't invalidate iterators or pointers at all, or even 51 // change the ordering of elements. 47 52 // 48 53 // Here are a few "power user" tips: … … 63 68 // the hash table will never shrink. 64 69 // 65 // Guide to what kind of hash_set to use:66 // (1) dense_hash_set: fastest, uses the most memory 70 // Roughly speaking: 71 // (1) dense_hash_set: fastest, uses the most memory unless entries are small 67 72 // (2) sparse_hash_set: slowest, uses the least memory 68 // (3) hash_set /unordered_set (STL): in the middle 73 // (3) hash_set / unordered_set (STL): in the middle 74 // 69 75 // Typically I use sparse_hash_set when I care about space and/or when 70 76 // I need to save the hashtable on disk. I use hash_set otherwise. I … … 72 78 // small sets with lots of lookups. 73 79 // 74 // - dense_hash_set has, typically, a factor of 2memory overhead (if your75 // data takes up X bytes, the hash_set uses X more bytes in overhead).76 // - sparse_hash_set has about 2bits overhead per entry.77 // - sparse_hash_ mapcan be 3-7 times slower than the others for lookup and,80 // - dense_hash_set has, typically, about 78% memory overhead (if your 81 // data takes up X bytes, the hash_set uses .78X more bytes in overhead). 82 // - sparse_hash_set has about 4 bits overhead per entry. 83 // - sparse_hash_set can be 3-7 times slower than the others for lookup and, 78 84 // especially, inserts. See time_hash_map.cc for details. 79 85 // … … 246 252 void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } 247 253 void clear_deleted_key() { rep.clear_deleted_key(); } 254 key_type deleted_key() const { return rep.deleted_key(); } 248 255 249 256 // These are standard -
lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsehash/densehashtable.h
r33943 r36827 103 103 #include <stdlib.h> // for abort() 104 104 #include <algorithm> // For swap(), eg 105 #include <stdexcept> // For length_error 105 106 #include <iostream> // For cerr 106 107 #include <memory> // For uninitialized_fill, uninitialized_copy … … 328 329 key_equal key_eq() const { return equals; } 329 330 331 // Accessor function for statistics gathering. 332 int num_table_copies() const { return num_ht_copies; } 333 330 334 private: 331 335 // Annoyingly, we can't copy values around, because they might have … … 358 362 } 359 363 364 bool test_deleted_key(const key_type& key) const { 365 // The num_deleted test is crucial for read(): after read(), the ht values 366 // are garbage, and we don't want to think some of them are deleted. 367 // Invariant: !use_deleted implies num_deleted is 0. 368 assert(use_deleted || num_deleted == 0); 369 return num_deleted > 0 && equals(delkey, key); 370 } 371 360 372 public: 361 373 void set_deleted_key(const key_type &key) { … … 372 384 use_deleted = false; 373 385 } 386 key_type deleted_key() const { 387 assert(use_deleted); 388 return delkey; 389 } 374 390 375 391 // These are public so the iterators can use them 376 392 // True if the item at position bucknum is "deleted" marker 377 393 bool test_deleted(size_type bucknum) const { 378 // The num_deleted test is crucial for read(): after read(), the ht values 379 // are garbage, and we don't want to think some of them are deleted. 380 return (use_deleted && num_deleted > 0 && 381 equals(delkey, get_key(table[bucknum]))); 394 return test_deleted_key(get_key(table[bucknum])); 382 395 } 383 396 bool test_deleted(const iterator &it) const { 384 return (use_deleted && num_deleted > 0 && 385 equals(delkey, get_key(*it))); 397 return test_deleted_key(get_key(*it)); 386 398 } 387 399 bool test_deleted(const const_iterator &it) const { 388 return (use_deleted && num_deleted > 0 && 389 equals(delkey, get_key(*it))); 390 } 391 // Set it so test_deleted is true. true if object didn't used to be deleted 392 // See below (at erase()) to explain why we allow const_iterators 400 return test_deleted_key(get_key(*it)); 401 } 402 403 // Set it so test_deleted is true. true if object didn't used to be deleted. 404 bool set_deleted(iterator &it) { 405 assert(use_deleted); // bad if set_deleted_key() wasn't called 406 bool retval = !test_deleted(it); 407 // &* converts from iterator to value-type. 408 set_key(&(*it), delkey); 409 return retval; 410 } 411 // Set it so test_deleted is false. true if object used to be deleted 412 bool clear_deleted(iterator &it) { 413 assert(use_deleted); // bad if set_deleted_key() wasn't called 414 // Happens automatically when we assign something else in its place. 415 return test_deleted(it); 416 } 417 418 // We also allow to set/clear the deleted bit on a const iterator. 419 // We allow a const_iterator for the same reason you can delete a 420 // const pointer: it's convenient, and semantically you can't use 421 // 'it' after it's been deleted anyway, so its const-ness doesn't 422 // really matter. 393 423 bool set_deleted(const_iterator &it) { 394 424 assert(use_deleted); // bad if set_deleted_key() wasn't called 395 425 bool retval = !test_deleted(it); 396 // &* converts from iterator to value-type397 426 set_key(const_cast<value_type*>(&(*it)), delkey); 398 427 return retval; … … 401 430 bool clear_deleted(const_iterator &it) { 402 431 assert(use_deleted); // bad if set_deleted_key() wasn't called 403 // happens automatically when we assign something else in its place404 432 return test_deleted(it); 405 433 } … … 460 488 assert(table); 461 489 fill_range_with_empty(table, table + num_buckets); 490 } 491 // TODO(sjackman): return a key_type rather than a value_type 492 value_type empty_key() const { 493 assert(use_empty); 494 return emptyval; 462 495 } 463 496 … … 488 521 size_type min_size(size_type num_elts, size_type min_buckets_wanted) { 489 522 size_type sz = HT_MIN_BUCKETS; // min buckets allowed 490 while ( sz < min_buckets_wanted || num_elts >= sz * enlarge_resize_percent ) 523 while ( sz < min_buckets_wanted || 524 num_elts >= static_cast<size_type>(sz * enlarge_resize_percent) ) { 525 if (sz * 2 < sz) 526 throw std::length_error("resize overflow"); // protect against overflow 491 527 sz *= 2; 528 } 492 529 return sz; 493 530 } … … 534 571 const size_type needed_size = min_size(num_elements + delta, 0); 535 572 if ( needed_size > bucket_count() ) { // we don't have enough buckets 536 const size_type resize_to = min_size(num_elements - num_deleted + delta, 537 0); 573 size_type resize_to = min_size(num_elements - num_deleted + delta, 574 bucket_count()); 575 if (resize_to < needed_size) { 576 // This situation means that we have enough deleted elements, 577 // that once we purge them, we won't actually have needed to 578 // grow. But we may want to grow anyway: if we just purge one 579 // element, say, we'll have to grow anyway next time we 580 // insert. Might as well grow now, since we're already going 581 // through the trouble of copying (in order to purge the 582 // deleted elements). 583 if (num_elements - num_deleted + delta >= 584 static_cast<size_type>(resize_to*2 * shrink_resize_percent)) { 585 // Good, we won't be below the shrink threshhold even if we double. 586 resize_to *= 2; 587 } 588 } 538 589 dense_hashtable tmp(*this, resize_to); 539 590 swap(tmp); // now we are tmp … … 600 651 num_elements++; 601 652 } 653 num_ht_copies++; 602 654 } 603 655 … … 648 700 ? HT_DEFAULT_STARTING_BUCKETS 649 701 : min_size(expected_max_items_in_table, 0)), 650 num_elements(0) {702 num_elements(0), num_ht_copies(0) { 651 703 // table is NULL until emptyval is set. However, we set num_buckets 652 704 // here so we know how much space to allocate once emptyval is set … … 664 716 enlarge_resize_percent(ht.enlarge_resize_percent), 665 717 shrink_resize_percent(ht.shrink_resize_percent), table(NULL), 666 num_buckets(0), num_elements(0) { 718 num_buckets(0), num_elements(0), num_ht_copies(ht.num_ht_copies) { 719 if (!ht.use_empty) { 720 // If use_empty isn't set, copy_from will crash, so we do our own copying. 721 assert(ht.empty()); 722 num_buckets = min_size(ht.size(), min_buckets_wanted); 723 reset_thresholds(); 724 return; 725 } 667 726 reset_thresholds(); 668 727 copy_from(ht, min_buckets_wanted); // copy_from() ignores deleted entries … … 671 730 dense_hashtable& operator= (const dense_hashtable& ht) { 672 731 if (&ht == this) return *this; // don't copy onto ourselves 673 clear(); 732 if (!ht.use_empty) { 733 assert(ht.empty()); 734 dense_hashtable empty_table(ht); // empty table with ht's thresholds 735 this->swap(empty_table); 736 return *this; 737 } 674 738 hash = ht.hash; 675 739 equals = ht.equals; … … 682 746 enlarge_resize_percent = ht.enlarge_resize_percent; 683 747 shrink_resize_percent = ht.shrink_resize_percent; 684 copy_from(ht, HT_MIN_BUCKETS); //sets num_deleted to 0 too748 copy_from(ht, HT_MIN_BUCKETS); // calls clear and sets num_deleted to 0 too 685 749 return *this; 686 750 } … … 713 777 STL_NAMESPACE::swap(num_buckets, ht.num_buckets); 714 778 STL_NAMESPACE::swap(num_elements, ht.num_elements); 779 STL_NAMESPACE::swap(num_ht_copies, ht.num_ht_copies); 715 780 reset_thresholds(); 716 781 ht.reset_thresholds(); … … 719 784 // It's always nice to be able to clear a table without deallocating it 720 785 void clear() { 786 const size_type new_num_buckets = min_size(0,0); 787 if (num_elements == 0 && 788 num_deleted == 0 && 789 new_num_buckets == num_buckets) { 790 // Table is already empty, and the number of buckets is already as we 791 // desire, so nothing to do. 792 return; 793 } 721 794 if (table) 722 795 destroy_buckets(0, num_buckets); 723 num_buckets = min_size(0,0); // our new size 724 reset_thresholds(); 725 table = (value_type *) realloc(table, num_buckets * sizeof(*table)); 796 if (!table || (new_num_buckets != num_buckets)) { 797 // Recompute the resize thresholds and realloc the table only if we're 798 // actually changing its size. 799 num_buckets = new_num_buckets; // our new size 800 reset_thresholds(); 801 table = (value_type *) realloc(table, num_buckets * sizeof(*table)); 802 } 726 803 assert(table); 727 804 fill_range_with_empty(table, table + num_buckets); … … 907 984 } 908 985 909 // This is really evil: really it should be iterator, not const_iterator. 910 // But...the only reason keys are const is to allow lookup. 911 // Since that's a moot issue for deleted keys, we allow const_iterators 986 // We return the iterator past the deleted item. 987 void erase(iterator pos) { 988 if ( pos == end() ) return; // sanity check 989 if ( set_deleted(pos) ) { // true if object has been newly deleted 990 ++num_deleted; 991 consider_shrink = true; // will think about shrink after next insert 992 } 993 } 994 995 void erase(iterator f, iterator l) { 996 for ( ; f != l; ++f) { 997 if ( set_deleted(f) ) // should always be true 998 ++num_deleted; 999 } 1000 consider_shrink = true; // will think about shrink after next insert 1001 } 1002 1003 // We allow you to erase a const_iterator just like we allow you to 1004 // erase an iterator. This is in parallel to 'delete': you can delete 1005 // a const pointer just like a non-const pointer. The logic is that 1006 // you can't use the object after it's erased anyway, so it doesn't matter 1007 // if it's const or not. 912 1008 void erase(const_iterator pos) { 913 1009 if ( pos == end() ) return; // sanity check … … 917 1013 } 918 1014 } 919 920 1015 void erase(const_iterator f, const_iterator l) { 921 1016 for ( ; f != l; ++f) { … … 1020 1115 size_type num_buckets; 1021 1116 size_type num_elements; 1117 int num_ht_copies; // a statistics counter incremented every Copy 1022 1118 bool consider_shrink; // true if we should try to shrink before next insert 1023 1119 -
lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsehash/sparsehashtable.h
r33943 r36827 111 111 #include <assert.h> 112 112 #include <algorithm> // For swap(), eg 113 #include <stdexcept> // For length_error 113 114 #include <iterator> // for facts about iterator tags 114 115 #include <utility> // for pair<> … … 403 404 key_equal key_eq() const { return equals; } 404 405 406 // Accessor function for statistics gathering. 407 int num_table_copies() const { return num_ht_copies; } 408 405 409 private: 406 410 // We need to copy values when we set the special marker for deleted … … 447 451 use_deleted = false; 448 452 } 453 key_type deleted_key() const { 454 assert(use_deleted); 455 return delkey; 456 } 449 457 450 458 // These are public so the iterators can use them … … 468 476 equals(delkey, get_key(*it))); 469 477 } 470 // Set it so test_deleted is true. true if object didn't used to be deleted 471 // See below (at erase()) to explain why we allow const_iterators 478 // Set it so test_deleted is true. true if object didn't used to be deleted. 479 bool set_deleted(iterator &it) { 480 assert(use_deleted); 481 bool retval = !test_deleted(it); 482 // &* converts from iterator to value-type. 483 set_key(&(*it), delkey); 484 return retval; 485 } 486 // Set it so test_deleted is false. true if object used to be deleted. 487 bool clear_deleted(iterator &it) { 488 assert(use_deleted); 489 // Happens automatically when we assign something else in its place. 490 return test_deleted(it); 491 } 492 493 // We also allow to set/clear the deleted bit on a const iterator. 494 // We allow a const_iterator for the same reason you can delete a 495 // const pointer: it's convenient, and semantically you can't use 496 // 'it' after it's been deleted anyway, so its const-ness doesn't 497 // really matter. 472 498 bool set_deleted(const_iterator &it) { 473 499 assert(use_deleted); // bad if set_deleted_key() wasn't called 474 500 bool retval = !test_deleted(it); 475 // &* converts from iterator to value-type476 501 set_key(const_cast<value_type*>(&(*it)), delkey); 477 502 return retval; 478 503 } 479 // Set it so test_deleted is false. true if object used to be deleted480 504 bool clear_deleted(const_iterator &it) { 481 505 assert(use_deleted); // bad if set_deleted_key() wasn't called 482 // happens automatically when we assign something else in its place483 506 return test_deleted(it); 484 507 } … … 507 530 // If you like, you can give a min #buckets as well as a min #elts 508 531 size_type min_size(size_type num_elts, size_type min_buckets_wanted) { 509 size_type sz = HT_MIN_BUCKETS; 510 while ( sz < min_buckets_wanted || num_elts >= sz * enlarge_resize_percent ) 532 size_type sz = HT_MIN_BUCKETS; // min buckets allowed 533 while ( sz < min_buckets_wanted || 534 num_elts >= static_cast<size_type>(sz * enlarge_resize_percent) ) { 535 if (sz * 2 < sz) 536 throw std::length_error("resize overflow"); // protect against overflow 511 537 sz *= 2; 538 } 512 539 return sz; 513 540 } … … 524 551 // like "dense_hash_set<int> x; x.insert(4); x.erase(4);" will 525 552 // shrink us down to HT_MIN_BUCKETS buckets, which is too small. 526 if (shrink_threshold > 0 527 &&(table.num_nonempty()-num_deleted) < shrink_threshold &&528 bucket_count() > HT_DEFAULT_STARTING_BUCKETS ) {553 if (shrink_threshold > 0 && 554 (table.num_nonempty()-num_deleted) < shrink_threshold && 555 bucket_count() > HT_DEFAULT_STARTING_BUCKETS) { 529 556 size_type sz = bucket_count() / 2; // find how much we should shrink 530 557 while ( sz > HT_DEFAULT_STARTING_BUCKETS && … … 555 582 const size_type needed_size = min_size(table.num_nonempty() + delta, 0); 556 583 if ( needed_size > bucket_count() ) { // we don't have enough buckets 557 const size_type resize_to = min_size(table.num_nonempty() - num_deleted 558 + delta, 0); 584 size_type resize_to = min_size(table.num_nonempty() - num_deleted + delta, 585 bucket_count()); 586 if (resize_to < needed_size) { 587 // This situation means that we have enough deleted elements, 588 // that once we purge them, we won't actually have needed to 589 // grow. But we may want to grow anyway: if we just purge one 590 // element, say, we'll have to grow anyway next time we 591 // insert. Might as well grow now, since we're already going 592 // through the trouble of copying (in order to purge the 593 // deleted elements). 594 if (table.num_nonempty() - num_deleted + delta >= 595 static_cast<size_type>(resize_to*2 * shrink_resize_percent)) { 596 // Good, we won't be below the shrink threshhold even if we double. 597 resize_to *= 2; 598 } 599 } 600 559 601 sparse_hashtable tmp(MoveDontCopy, *this, resize_to); 560 602 swap(tmp); // now we are tmp … … 589 631 table.set(bucknum, *it); // copies the value to here 590 632 } 633 num_ht_copies++; 591 634 } 592 635 … … 626 669 table.set(bucknum, *it); // copies the value to here 627 670 } 671 num_ht_copies++; 628 672 } 629 673 … … 673 717 table(expected_max_items_in_table == 0 674 718 ? HT_DEFAULT_STARTING_BUCKETS 675 : min_size(expected_max_items_in_table, 0)) { 719 : min_size(expected_max_items_in_table, 0)), 720 num_ht_copies(0) { 676 721 reset_thresholds(); 677 722 } … … 688 733 enlarge_resize_percent(ht.enlarge_resize_percent), 689 734 shrink_resize_percent(ht.shrink_resize_percent), 690 table() {735 table(), num_ht_copies(ht.num_ht_copies) { 691 736 reset_thresholds(); 692 737 copy_from(ht, min_buckets_wanted); // copy_from() ignores deleted entries … … 698 743 enlarge_resize_percent(ht.enlarge_resize_percent), 699 744 shrink_resize_percent(ht.shrink_resize_percent), 700 table() {745 table(), num_ht_copies(ht.num_ht_copies) { 701 746 reset_thresholds(); 702 747 move_from(mover, ht, min_buckets_wanted); // ignores deleted entries … … 705 750 sparse_hashtable& operator= (const sparse_hashtable& ht) { 706 751 if (&ht == this) return *this; // don't copy onto ourselves 707 clear();708 752 hash = ht.hash; 709 753 equals = ht.equals; … … 712 756 use_deleted = ht.use_deleted; 713 757 delkey = ht.delkey; 714 copy_from(ht, HT_MIN_BUCKETS); //sets num_deleted to 0 too758 copy_from(ht, HT_MIN_BUCKETS); // calls clear and sets num_deleted to 0 too 715 759 return *this; 716 760 } … … 728 772 STL_NAMESPACE::swap(delkey, ht.delkey); 729 773 table.swap(ht.table); 774 STL_NAMESPACE::swap(num_ht_copies, ht.num_ht_copies); 730 775 reset_thresholds(); 731 776 ht.reset_thresholds(); … … 734 779 // It's always nice to be able to clear a table without deallocating it 735 780 void clear() { 736 table.clear(); 781 if (!empty() || (num_deleted != 0)) { 782 table.clear(); 783 } 737 784 reset_thresholds(); 738 785 num_deleted = 0; … … 888 935 // DELETION ROUTINES 889 936 size_type erase(const key_type& key) { 890 // First, double-check we're not erasing delkey 937 // First, double-check we're not erasing delkey. 891 938 assert(!use_deleted || !equals(key, delkey)); 892 939 const_iterator pos = find(key); // shrug: shouldn't need to be const … … 902 949 } 903 950 904 // This is really evil: really it should be iterator, not const_iterator. 905 // But...the only reason keys are const is to allow lookup. 906 // Since that's a moot issue for deleted keys, we allow const_iterators 951 // We return the iterator past the deleted item. 952 void erase(iterator pos) { 953 if ( pos == end() ) return; // sanity check 954 if ( set_deleted(pos) ) { // true if object has been newly deleted 955 ++num_deleted; 956 consider_shrink = true; // will think about shrink after next insert 957 } 958 } 959 960 void erase(iterator f, iterator l) { 961 for ( ; f != l; ++f) { 962 if ( set_deleted(f) ) // should always be true 963 ++num_deleted; 964 } 965 consider_shrink = true; // will think about shrink after next insert 966 } 967 968 // We allow you to erase a const_iterator just like we allow you to 969 // erase an iterator. This is in parallel to 'delete': you can delete 970 // a const pointer just like a non-const pointer. The logic is that 971 // you can't use the object after it's erased anyway, so it doesn't matter 972 // if it's const or not. 907 973 void erase(const_iterator pos) { 908 974 if ( pos == end() ) return; // sanity check … … 912 978 } 913 979 } 914 915 980 void erase(const_iterator f, const_iterator l) { 916 981 for ( ; f != l; ++f) { … … 977 1042 sparsetable<value_type> table; // holds num_buckets and num_elements too 978 1043 bool consider_shrink; // true if we should try to shrink before next insert 1044 int num_ht_copies; // a statistics counter incremented every Copy/Move 979 1045 980 1046 void reset_thresholds() { -
lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsetable
r33943 r36827 739 739 // We need a small function that tells us how many set bits there are 740 740 // in positions 0..i-1 of the bitmap. It uses a big table. 741 // We make it static so templates don't allocate lots of these tables 741 // We make it static so templates don't allocate lots of these tables. 742 // There are lots of ways to do this calculation (called 'popcount'). 743 // The 8-bit table lookup is one of the fastest, though this 744 // implementation suffers from not doing any loop unrolling. See, eg, 745 // http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html 746 // http://gurmeetsingh.wordpress.com/2008/08/05/fast-bit-counting-routines/ 742 747 static size_type pos_to_offset(const unsigned char *bm, size_type pos) { 743 748 // We could make these ints. The tradeoff is size (eg does it overwhelm
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)