Changeset 36827 for lang

Show
Ignore:
Timestamp:
02/21/10 13:06:04 (3 years ago)
Author:
winebarrel
Message:
 
Location:
lang/ruby/ruby-bayon/trunk
Files:
17 modified

Legend:

Unmodified
Added
Removed
  • lang/ruby/ruby-bayon/trunk/bayon/byvector.cc

    r35907 r36827  
    1 //  
     1// 
    22// Utilities for vector operation 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    147147    other = &vec1; 
    148148  } 
    149    
     149 
    150150  double prod = 0; 
    151151  while (it != end) { 
     
    169169    double prod = Vector::inner_product(vec1, vec2); 
    170170    result = prod / (norm1 * norm2); 
    171     return std::isnan(result) ? 0.0 : result; 
     171    return isnan(result) ? 0.0 : result; 
    172172  } 
    173173} 
     
    178178  double norm2 = vec2.norm(); 
    179179  double prod = Vector::inner_product(vec1, vec2); 
    180   double denom = norm1 * norm2 - prod; 
     180  double denom = norm1 + norm2 - prod; 
    181181  double result = 0.0; 
    182182  if (!denom) { 
     
    184184  } else { 
    185185    result = prod / denom; 
    186     return std::isnan(result) ? 0.0 : result; 
     186    return isnan(result) ? 0.0 : result; 
    187187  } 
    188188} 
  • lang/ruby/ruby-bayon/trunk/bayon/byvector.h

    r35907 r36827  
    22// Utility class for vector operation 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    232232  /** 
    233233   * Add other vector 
    234    *  
     234   * 
    235235   * @param vec input vector 
    236236   * @return void 
     
    240240  /** 
    241241   * Delete other vector 
    242    *  
     242   * 
    243243   * @param vec input vector 
    244244   * @return void 
  • lang/ruby/ruby-bayon/trunk/bayon/classifier.cc

    r35907 r36827  
    22// Classifier 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    1616// along with this program; if not, write to the Free Software 
    1717// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    18 //  
     18// 
    1919 
    2020#include "classifier.h" 
     
    8181/* Get list of id and points of similar vectors */ 
    8282void Classifier::similar_vectors( 
    83   size_t max, const Vector &vec,  
     83  size_t max, const Vector &vec, 
    8484  std::vector<std::pair<VectorId, double> > &items) const { 
    8585 
  • lang/ruby/ruby-bayon/trunk/bayon/classifier.h

    r35907 r36827  
    22// Classifier 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    1616// along with this program; if not, write to the Free Software 
    1717// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    18 //  
     18// 
    1919 
    2020#ifndef BAYON_CLASSIFIER_H 
     
    8585   */ 
    8686  Classifier() { 
    87     init_hash_map(VECID_EMPTY_KEY, vectors_);   
    88     init_hash_map(VECID_EMPTY_KEY, inverted_index_);   
     87    init_hash_map(VECID_EMPTY_KEY, vectors_); 
     88    init_hash_map(VECID_EMPTY_KEY, inverted_index_); 
    8989  } 
    9090 
     
    9595    for (InvertedIndex::iterator it = inverted_index_.begin(); 
    9696         it != inverted_index_.end(); ++it) { 
    97       if (it->second) delete it->second;     
     97      if (it->second) delete it->second; 
    9898    } 
    9999  } 
  • lang/ruby/ruby-bayon/trunk/bayon/cluster.cc

    r35908 r36827  
    22// Clustering API 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    2121 
    2222namespace bayon { 
    23    
     23 
    2424/* Get sorted documents in clusters */ 
    2525void Cluster::sorted_documents( 
     
    5858  if (siz < ndocs) ndocs = siz; 
    5959  size_t index, count = 0; 
    60    
     60 
    6161  index = myrand(&seed_) % siz; // initial center 
    6262  docs.push_back(documents_[index]); 
     
    140140size_t Analyzer::repeated_bisection() { 
    141141  Cluster *cluster = new Cluster(); 
    142     cluster->set_seed(seed_); 
     142  cluster->set_seed(seed_); 
    143143  for (size_t i = 0; i < documents_.size(); i++) { 
    144144    cluster->add_document(documents_[i]); 
     
    175175      sectioned[i]->set_sectioned_gain(); 
    176176      if (sectioned[i]->sectioned_gain() < limit_eval_) { 
    177         sectioned[i]->composite_vector()->clear(); 
     177        for (size_t j = 0; j < sectioned[i]->sectioned_clusters().size(); j++) { 
     178          sectioned[i]->sectioned_clusters()[j]->clear(); 
     179        } 
    178180      } 
    179181      sectioned[i]->composite_vector()->clear(); 
     
    197199  double *norms = static_cast<double *>(_alloca(sizeof(double) * clusters.size())); 
    198200#endif 
     201 
    199202  for (size_t i = 0; i < clusters.size(); i++) { 
    200203    norms[i] = clusters[i]->composite_vector()->norm(); 
     
    221224      double value_base = refined_vector_value( 
    222225        *clusters[cluster_id]->composite_vector(), *doc->feature(), -1); 
    223       double norm_base_moved = sqrt(pow(norms[cluster_id], 2) + value_base); 
     226      double norm_base_moved = pow(norms[cluster_id], 2) + value_base; 
     227      norm_base_moved = norm_base_moved > 0 ? sqrt(norm_base_moved) : 0.0; 
    224228 
    225229      double eval_max = -1.0; 
     
    230234        double value_target = refined_vector_value( 
    231235          *clusters[j]->composite_vector(), *doc->feature(), 1); 
    232         double norm_target_moved = sqrt(pow(norms[j], 2) + value_target); 
     236        double norm_target_moved = pow(norms[j], 2) + value_target; 
     237        norm_target_moved = norm_target_moved > 0 ? 
     238          sqrt(norm_target_moved) : 0.0; 
    233239        double eval_moved = norm_base_moved + norm_target_moved 
    234                             - (norms[cluster_id] + norms[j]); 
     240                            - norms[cluster_id] - norms[j]; 
    235241        if (eval_max < eval_moved) { 
    236242          eval_max = eval_moved; 
     
    285291  count_df(df); 
    286292  size_t ndocs = documents_.size(); 
    287   for (size_t i = 0; i < documents_.size(); i++) { 
    288     VecHashMap *hmap = documents_[i]->feature()->hash_map(); 
    289     for (VecHashMap::iterator it = hmap->begin(); it != hmap->end(); ++it) { 
    290       (*hmap)[it->first] = it->second * log((double)ndocs / (df[it->first] + 1)); 
    291     } 
     293  for (size_t i = 0; i < ndocs; i++) { 
     294    documents_[i]->idf(df, ndocs); 
    292295  } 
    293296} 
  • lang/ruby/ruby-bayon/trunk/bayon/cluster.h

    r35907 r36827  
    22// Clustering API 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    1616// along with this program; if not, write to the Free Software 
    1717// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    18 //  
     18// 
    1919 
    2020#ifndef BAYON_CLUSTER_H 
     
    4343 
    4444/******************************************************************** 
    45  * Typedef 
     45 * Constants 
    4646 *******************************************************************/ 
    4747const DocumentId DOC_EMPTY_KEY   = -1; 
     
    143143  } 
    144144 
     145  void idf(const HashMap<VecKey, size_t>::type &df, size_t ndocs) { 
     146    VecHashMap *hmap = feature()->hash_map(); 
     147    HashMap<VecKey, size_t>::type::const_iterator dit; 
     148    for (VecHashMap::iterator it = hmap->begin(); it != hmap->end(); ++it) { 
     149      dit = df.find(it->first); 
     150      size_t denom = (dit != df.end()) ? dit->second : 1; 
     151      (*hmap)[it->first] = 
     152        it->second * log(static_cast<double>(ndocs) / denom); 
     153    } 
     154  } 
    145155}; 
    146156 
     
    185195   */ 
    186196  unsigned int seed_; 
     197 
     198  /** 
     199   * add the vectors of all documents to composite vector 
     200   */ 
     201  void set_composite_vector() { 
     202    composite_.clear(); 
     203    for (size_t i = 0; i < documents_.size(); i++) { 
     204      composite_.add_vector(*documents_[i]->feature()); 
     205    } 
     206  } 
    187207 
    188208 public: 
     
    241261  Vector *centroid_vector() { 
    242262    if (documents_.size() > 0 && composite_.size() == 0) { 
    243       for (size_t i = 0; i < documents_.size(); i++) { 
    244         composite_.add_vector(*documents_[i]->feature()); 
    245       } 
    246     } 
    247     composite_.copy(centroid_); 
     263      set_composite_vector(); 
     264    } 
     265    composite_vector()->copy(centroid_); 
    248266    centroid_.normalize(); 
    249267    return &centroid_; 
     
    256274   */ 
    257275  Vector *composite_vector() { 
    258     return &composite_; 
    259   } 
    260  
    261   /** 
    262    * Get composite Vector of the cluster 
    263    * 
    264    * @return const Vector * composite vector 
    265    */ 
    266   const Vector *composite_vector() const { 
    267276    return &composite_; 
    268277  } 
     
    301310 
    302311  /** 
     312   * Remove a document 
     313   * 
     314   * @param doc  pointer of document object 
     315   * @return void 
     316   */ 
     317  void remove_document(const Document *doc) { 
     318    for (size_t i = 0; i < documents_.size(); i++) { 
     319      if (documents_[i]->id() == doc->id()) { 
     320        remove_document(i); 
     321        return; 
     322      } 
     323    } 
     324  } 
     325 
     326  /** 
    303327   * Get sorted documents in clusters by similarity 
    304328   * between documents and center (desc order) 
     
    318342      std::vector<Document *> docs; 
    319343      for (size_t i = 0; i < documents_.size(); i++) { 
    320         if (!removed_[documents_[i]->id()]) { 
     344        if (removed_.find(documents_[i]->id()) == removed_.end()) { 
    321345          docs.push_back(documents_[i]); 
    322346        } 
  • lang/ruby/ruby-bayon/trunk/bayon/util.cc

    r35907 r36827  
    22// Utility functions 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    1616// along with this program; if not, write to the Free Software 
    1717// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    18 //  
     18// 
    1919 
    2020#include "util.h" 
  • lang/ruby/ruby-bayon/trunk/bayon/util.h

    r35907 r36827  
    22// Utility functions 
    33// 
    4 // Copyright(C) 2009  Mizuki Fujisawa <mfujisa@gmail.com> 
     4// Copyright(C) 2009  Mizuki Fujisawa <fujisawa@bayon.cc> 
    55// 
    66// This program is free software; you can redistribute it and/or modify 
     
    1616// along with this program; if not, write to the Free Software 
    1717// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
    18 //  
     18// 
    1919 
    2020#ifndef BAYON_UTIL_H 
     
    2525#endif 
    2626 
     27#include <cmath> 
    2728#include <cstdlib> 
    2829#include <iostream> 
     
    4647#endif 
    4748 
    48 /* isnan for win32 */ 
     49/* isnan */ 
     50#ifndef isnan 
    4951#ifdef _WIN32 
    5052#include <cfloat> 
    5153#define isnan(x) _isnan(x) 
     54#else 
     55#define isnan(x) (x != x) 
     56#endif 
    5257#endif 
    5358 
    5459 
    5560/* hash function of string key for __gnu_cxx::hash_map */ 
    56 #if !defined(HAVE_GOOGLE_DENSE_HASH_MAP) && defined(HAVE_EXT_HASH_MAP) 
     61#if (defined(_WIN32) || !defined(HAVE_GOOGLE_DENSE_HASH_MAP)) && defined(HAVE_EXT_HASH_MAP) 
    5762namespace __gnu_cxx { 
    5863  template<> struct hash<std::string> { 
    5964    size_t operator() (const std::string &x) const { 
    6065      return hash<const char *>()(x.c_str()); 
    61     }    
     66    } 
    6267  }; 
    6368} 
     
    110115void init_hash_map(const KeyType &empty_key, HashType &hmap) { 
    111116#ifdef HAVE_GOOGLE_DENSE_HASH_MAP 
    112   hmap.max_load_factor(static_cast<float>(0.9)); 
     117   hmap.max_load_factor(static_cast<float>(0.9)); 
    113118  hmap.set_empty_key(empty_key); 
    114119#endif 
     
    175180  void set_seed(unsigned int seed) { seed_ = seed; } 
    176181  unsigned int operator()(unsigned int max) { 
    177     double ratio =  static_cast<double>(myrand(&seed_)) 
    178                     / static_cast<double>(RAND_MAX); 
     182    double ratio = static_cast<double>(myrand(&seed_)) 
     183                   / static_cast<double>(RAND_MAX); 
    179184    return static_cast<unsigned int>(ratio * max); 
    180185  } 
  • lang/ruby/ruby-bayon/trunk/lib/bayon.rb

    r34106 r36827  
    4646    end 
    4747 
     48    def delete_document(label) 
     49      @documents.delete_if {|l, f| l == label } 
     50    end 
     51 
     52    def [](label) 
     53      label, features = @documents.assoc(label) 
     54      return features 
     55    end 
     56 
     57    def labels 
     58      @documents.map {|l, f| l } 
     59    end 
     60 
     61    def clear 
     62      @documents.clear 
     63    end 
     64 
    4865    def do_clustering(method = Analyzer::REPEATED_BISECTION) 
     66      return [] if @documents.empty? 
     67 
    4968      analyzer = Analyzer.new 
    5069      analyzer.set_cluster_size_limit(@cluster_size_limit) if @cluster_size_limit 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/dense_hash_map

    r33943 r36827  
    4646// 
    4747// In other respects, we adhere mostly to the STL semantics for 
    48 // hash-map.  One important exception is that insert() invalidates 
    49 // iterators entirely.  On the plus side, though, erase() doesn't 
    50 // invalidate iterators at all, or even change the ordering of elements. 
     48// hash-map.  One important exception is that insert() may invalidate 
     49// iterators entirely -- STL semantics are that insert() may reorder 
     50// iterators, but they all still refer to something valid in the 
     51// hashtable.  Not so for us.  Likewise, insert() may invalidate 
     52// pointers into the hashtable.  (Whether insert invalidates iterators 
     53// and pointers depends on whether it results in a hashtable resize). 
     54// On the plus side, delete() doesn't invalidate iterators or pointers 
     55// at all, or even change the ordering of elements. 
    5156// 
    5257// Here are a few "power user" tips: 
     
    6873//         the hash table will never shrink. 
    6974// 
    70 // Guide to what kind of hash_map to use: 
    71 //   (1) dense_hash_map: fastest, uses the most memory 
     75// Roughly speaking: 
     76//   (1) dense_hash_map: fastest, uses the most memory unless entries are small 
    7277//   (2) sparse_hash_map: slowest, uses the least memory 
    73 //   (3) hash_map (STL): in the middle 
     78//   (3) hash_map / unordered_map (STL): in the middle 
     79// 
    7480// Typically I use sparse_hash_map when I care about space and/or when 
    7581// I need to save the hashtable on disk.  I use hash_map otherwise.  I 
     
    7783// small sets with lots of lookups. 
    7884// 
    79 // - dense_hash_map has, typically, a factor of 2 memory overhead (if your 
    80 //   data takes up X bytes, the hash_map uses X more bytes in overhead). 
    81 // - sparse_hash_map has about 2 bits overhead per entry. 
     85// - dense_hash_map has, typically, about 78% memory overhead (if your 
     86//   data takes up X bytes, the hash_map uses .78X more bytes in overhead). 
     87// - sparse_hash_map has about 4 bits overhead per entry. 
    8288// - sparse_hash_map can be 3-7 times slower than the others for lookup and, 
    8389//   especially, inserts.  See time_hash_map.cc for details. 
     
    276282    rep.set_empty_key(value_type(key, data_type()));    // rep wants a value 
    277283  } 
    278   void set_deleted_key(const key_type& key)   { 
    279     rep.set_deleted_key(key); 
    280   } 
     284  key_type empty_key() const { 
     285    return rep.empty_key().first;                       // rep returns a value 
     286  } 
     287 
     288  void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); } 
    281289  void clear_deleted_key()                    { rep.clear_deleted_key(); } 
     290  key_type deleted_key() const                { return rep.deleted_key(); } 
    282291 
    283292  // These are standard 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/dense_hash_set

    r33943 r36827  
    5050// 
    5151// In other respects, we adhere mostly to the STL semantics for 
    52 // hash-set.  One important exception is that insert() invalidates 
    53 // iterators entirely.  On the plus side, though, erase() doesn't 
    54 // invalidate iterators at all, or even change the ordering of elements. 
     52// hash-map.  One important exception is that insert() may invalidate 
     53// iterators entirely -- STL semantics are that insert() may reorder 
     54// iterators, but they all still refer to something valid in the 
     55// hashtable.  Not so for us.  Likewise, insert() may invalidate 
     56// pointers into the hashtable.  (Whether insert invalidates iterators 
     57// and pointers depends on whether it results in a hashtable resize). 
     58// On the plus side, delete() doesn't invalidate iterators or pointers 
     59// at all, or even change the ordering of elements. 
    5560// 
    5661// Here are a few "power user" tips: 
     
    7277//         the hash table will never shrink. 
    7378// 
    74 // Guide to what kind of hash_set to use: 
    75 //   (1) dense_hash_set: fastest, uses the most memory 
     79// Roughly speaking: 
     80//   (1) dense_hash_set: fastest, uses the most memory unless entries are small 
    7681//   (2) sparse_hash_set: slowest, uses the least memory 
    77 //   (3) hash_set (STL): in the middle 
     82//   (3) hash_set / unordered_set (STL): in the middle 
     83// 
    7884// Typically I use sparse_hash_set when I care about space and/or when 
    7985// I need to save the hashtable on disk.  I use hash_set otherwise.  I 
     
    8187// small sets with lots of lookups. 
    8288// 
    83 // - dense_hash_set has, typically, a factor of 2 memory overhead (if your 
    84 //   data takes up X bytes, the hash_set uses X more bytes in overhead). 
    85 // - sparse_hash_set has about 2 bits overhead per entry. 
    86 // - sparse_hash_map can be 3-7 times slower than the others for lookup and, 
     89// - dense_hash_set has, typically, about 78% memory overhead (if your 
     90//   data takes up X bytes, the hash_set uses .78X more bytes in overhead). 
     91// - sparse_hash_set has about 4 bits overhead per entry. 
     92// - sparse_hash_set can be 3-7 times slower than the others for lookup and, 
    8793//   especially, inserts.  See time_hash_map.cc for details. 
    8894// 
     
    256262  // deleted key as time goes on, or get rid of it entirely to be insert-only. 
    257263  void set_empty_key(const key_type& key)     { rep.set_empty_key(key); } 
     264  key_type empty_key() const                  { return rep.empty_key(); } 
     265 
    258266  void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); } 
    259267  void clear_deleted_key()                    { rep.clear_deleted_key(); } 
     268  key_type deleted_key() const                { return rep.deleted_key(); } 
    260269 
    261270  // These are standard 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/sparse_hash_map

    r33943 r36827  
    3838// 
    3939// We adhere mostly to the STL semantics for hash-map.  One important 
    40 // exception is that insert() invalidates iterators entirely.  On the 
    41 // plus side, though, delete() doesn't invalidate iterators at all, or 
    42 // even change the ordering of elements. 
     40// exception is that insert() may invalidate iterators entirely -- STL 
     41// semantics are that insert() may reorder iterators, but they all 
     42// still refer to something valid in the hashtable.  Not so for us. 
     43// Likewise, insert() may invalidate pointers into the hashtable. 
     44// (Whether insert invalidates iterators and pointers depends on 
     45// whether it results in a hashtable resize).  On the plus side, 
     46// delete() doesn't invalidate iterators or pointers at all, or even 
     47// change the ordering of elements. 
    4348// 
    4449// Here are a few "power user" tips: 
     
    5964//         the hash table will never shrink. 
    6065// 
    61 // Guide to what kind of hash_map to use: 
    62 //   (1) dense_hash_map: fastest, uses the most memory 
     66// Roughly speaking: 
     67//   (1) dense_hash_map: fastest, uses the most memory unless entries are small 
    6368//   (2) sparse_hash_map: slowest, uses the least memory 
    6469//   (3) hash_map / unordered_map (STL): in the middle 
     70// 
    6571// Typically I use sparse_hash_map when I care about space and/or when 
    6672// I need to save the hashtable on disk.  I use hash_map otherwise.  I 
     
    6874// small maps with lots of lookups. 
    6975// 
    70 // - dense_hash_map has, typically, a factor of 2 memory overhead (if your 
    71 //   data takes up X bytes, the hash_map uses X more bytes in overhead). 
    72 // - sparse_hash_map has about 2 bits overhead per entry. 
     76// - dense_hash_map has, typically, about 78% memory overhead (if your 
     77//   data takes up X bytes, the hash_map uses .78X more bytes in overhead). 
     78// - sparse_hash_map has about 4 bits overhead per entry. 
    7379// - sparse_hash_map can be 3-7 times slower than the others for lookup and, 
    7480//   especially, inserts.  See time_hash_map.cc for details. 
     
    264270  } 
    265271  void clear_deleted_key()                    { rep.clear_deleted_key(); } 
     272  key_type deleted_key() const                { return rep.deleted_key(); } 
    266273 
    267274  // These are standard 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/sparse_hash_set

    r33943 r36827  
    4141// change the key, and for sets there is no value). 
    4242// 
    43 // We adhere mostly to the STL semantics for hash-set.  One important 
    44 // exception is that insert() invalidates iterators entirely.  On the 
    45 // plus side, though, delete() doesn't invalidate iterators at all, or 
    46 // even change the ordering of elements. 
     43// We adhere mostly to the STL semantics for hash-map.  One important 
     44// exception is that insert() may invalidate iterators entirely -- STL 
     45// semantics are that insert() may reorder iterators, but they all 
     46// still refer to something valid in the hashtable.  Not so for us. 
     47// Likewise, insert() may invalidate pointers into the hashtable. 
     48// (Whether insert invalidates iterators and pointers depends on 
     49// whether it results in a hashtable resize).  On the plus side, 
     50// delete() doesn't invalidate iterators or pointers at all, or even 
     51// change the ordering of elements. 
    4752// 
    4853// Here are a few "power user" tips: 
     
    6368//         the hash table will never shrink. 
    6469// 
    65 // Guide to what kind of hash_set to use: 
    66 //   (1) dense_hash_set: fastest, uses the most memory 
     70// Roughly speaking: 
     71//   (1) dense_hash_set: fastest, uses the most memory unless entries are small 
    6772//   (2) sparse_hash_set: slowest, uses the least memory 
    68 //   (3) hash_set /unordered_set (STL): in the middle 
     73//   (3) hash_set / unordered_set (STL): in the middle 
     74// 
    6975// Typically I use sparse_hash_set when I care about space and/or when 
    7076// I need to save the hashtable on disk.  I use hash_set otherwise.  I 
     
    7278// small sets with lots of lookups. 
    7379// 
    74 // - dense_hash_set has, typically, a factor of 2 memory overhead (if your 
    75 //   data takes up X bytes, the hash_set uses X more bytes in overhead). 
    76 // - sparse_hash_set has about 2 bits overhead per entry. 
    77 // - sparse_hash_map can be 3-7 times slower than the others for lookup and, 
     80// - dense_hash_set has, typically, about 78% memory overhead (if your 
     81//   data takes up X bytes, the hash_set uses .78X more bytes in overhead). 
     82// - sparse_hash_set has about 4 bits overhead per entry. 
     83// - sparse_hash_set can be 3-7 times slower than the others for lookup and, 
    7884//   especially, inserts.  See time_hash_map.cc for details. 
    7985// 
     
    246252  void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); } 
    247253  void clear_deleted_key()                    { rep.clear_deleted_key(); } 
     254  key_type deleted_key() const                { return rep.deleted_key(); } 
    248255 
    249256  // These are standard 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsehash/densehashtable.h

    r33943 r36827  
    103103#include <stdlib.h>             // for abort() 
    104104#include <algorithm>            // For swap(), eg 
     105#include <stdexcept>            // For length_error 
    105106#include <iostream>             // For cerr 
    106107#include <memory>               // For uninitialized_fill, uninitialized_copy 
     
    328329  key_equal key_eq() const  { return equals; } 
    329330 
     331  // Accessor function for statistics gathering. 
     332  int num_table_copies() const { return num_ht_copies; } 
     333 
    330334 private: 
    331335  // Annoyingly, we can't copy values around, because they might have 
     
    358362  } 
    359363 
     364  bool test_deleted_key(const key_type& key) const { 
     365    // The num_deleted test is crucial for read(): after read(), the ht values 
     366    // are garbage, and we don't want to think some of them are deleted. 
     367    // Invariant: !use_deleted implies num_deleted is 0. 
     368    assert(use_deleted || num_deleted == 0); 
     369    return num_deleted > 0 && equals(delkey, key); 
     370  } 
     371 
    360372 public: 
    361373  void set_deleted_key(const key_type &key) { 
     
    372384    use_deleted = false; 
    373385  } 
     386  key_type deleted_key() const { 
     387    assert(use_deleted); 
     388    return delkey; 
     389  } 
    374390 
    375391  // These are public so the iterators can use them 
    376392  // True if the item at position bucknum is "deleted" marker 
    377393  bool test_deleted(size_type bucknum) const { 
    378     // The num_deleted test is crucial for read(): after read(), the ht values 
    379     // are garbage, and we don't want to think some of them are deleted. 
    380     return (use_deleted && num_deleted > 0 && 
    381             equals(delkey, get_key(table[bucknum]))); 
     394    return test_deleted_key(get_key(table[bucknum])); 
    382395  } 
    383396  bool test_deleted(const iterator &it) const { 
    384     return (use_deleted && num_deleted > 0 && 
    385             equals(delkey, get_key(*it))); 
     397    return test_deleted_key(get_key(*it)); 
    386398  } 
    387399  bool test_deleted(const const_iterator &it) const { 
    388     return (use_deleted && num_deleted > 0 && 
    389             equals(delkey, get_key(*it))); 
    390   } 
    391   // Set it so test_deleted is true.  true if object didn't used to be deleted 
    392   // See below (at erase()) to explain why we allow const_iterators 
     400    return test_deleted_key(get_key(*it)); 
     401  } 
     402 
     403  // Set it so test_deleted is true.  true if object didn't used to be deleted. 
     404  bool set_deleted(iterator &it) { 
     405    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
     406    bool retval = !test_deleted(it); 
     407    // &* converts from iterator to value-type. 
     408    set_key(&(*it), delkey); 
     409    return retval; 
     410  } 
     411  // Set it so test_deleted is false.  true if object used to be deleted 
     412  bool clear_deleted(iterator &it) { 
     413    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
     414    // Happens automatically when we assign something else in its place. 
     415    return test_deleted(it); 
     416  } 
     417 
     418  // We also allow to set/clear the deleted bit on a const iterator. 
     419  // We allow a const_iterator for the same reason you can delete a 
     420  // const pointer: it's convenient, and semantically you can't use 
     421  // 'it' after it's been deleted anyway, so its const-ness doesn't 
     422  // really matter. 
    393423  bool set_deleted(const_iterator &it) { 
    394424    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
    395425    bool retval = !test_deleted(it); 
    396     // &* converts from iterator to value-type 
    397426    set_key(const_cast<value_type*>(&(*it)), delkey); 
    398427    return retval; 
     
    401430  bool clear_deleted(const_iterator &it) { 
    402431    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
    403     // happens automatically when we assign something else in its place 
    404432    return test_deleted(it); 
    405433  } 
     
    460488    assert(table); 
    461489    fill_range_with_empty(table, table + num_buckets); 
     490  } 
     491  // TODO(sjackman): return a key_type rather than a value_type 
     492  value_type empty_key() const { 
     493    assert(use_empty); 
     494    return emptyval; 
    462495  } 
    463496 
     
    488521  size_type min_size(size_type num_elts, size_type min_buckets_wanted) { 
    489522    size_type sz = HT_MIN_BUCKETS;             // min buckets allowed 
    490     while ( sz < min_buckets_wanted || num_elts >= sz * enlarge_resize_percent ) 
     523    while ( sz < min_buckets_wanted || 
     524            num_elts >= static_cast<size_type>(sz * enlarge_resize_percent) ) { 
     525      if (sz * 2 < sz) 
     526        throw std::length_error("resize overflow");  // protect against overflow 
    491527      sz *= 2; 
     528    } 
    492529    return sz; 
    493530  } 
     
    534571    const size_type needed_size = min_size(num_elements + delta, 0); 
    535572    if ( needed_size > bucket_count() ) {      // we don't have enough buckets 
    536       const size_type resize_to = min_size(num_elements - num_deleted + delta, 
    537                                            0); 
     573      size_type resize_to = min_size(num_elements - num_deleted + delta, 
     574                                     bucket_count()); 
     575      if (resize_to < needed_size) { 
     576        // This situation means that we have enough deleted elements, 
     577        // that once we purge them, we won't actually have needed to 
     578        // grow.  But we may want to grow anyway: if we just purge one 
     579        // element, say, we'll have to grow anyway next time we 
     580        // insert.  Might as well grow now, since we're already going 
     581        // through the trouble of copying (in order to purge the 
     582        // deleted elements). 
     583        if (num_elements - num_deleted + delta >= 
     584            static_cast<size_type>(resize_to*2 * shrink_resize_percent)) { 
     585          // Good, we won't be below the shrink threshhold even if we double. 
     586          resize_to *= 2; 
     587        } 
     588      } 
    538589      dense_hashtable tmp(*this, resize_to); 
    539590      swap(tmp);                             // now we are tmp 
     
    600651      num_elements++; 
    601652    } 
     653    num_ht_copies++; 
    602654  } 
    603655 
     
    648700                  ? HT_DEFAULT_STARTING_BUCKETS 
    649701                  : min_size(expected_max_items_in_table, 0)), 
    650       num_elements(0) { 
     702      num_elements(0), num_ht_copies(0) { 
    651703    // table is NULL until emptyval is set.  However, we set num_buckets 
    652704    // here so we know how much space to allocate once emptyval is set 
     
    664716      enlarge_resize_percent(ht.enlarge_resize_percent), 
    665717      shrink_resize_percent(ht.shrink_resize_percent), table(NULL), 
    666       num_buckets(0), num_elements(0) { 
     718      num_buckets(0), num_elements(0), num_ht_copies(ht.num_ht_copies) { 
     719    if (!ht.use_empty) { 
     720      // If use_empty isn't set, copy_from will crash, so we do our own copying. 
     721      assert(ht.empty()); 
     722      num_buckets = min_size(ht.size(), min_buckets_wanted); 
     723      reset_thresholds(); 
     724      return; 
     725    } 
    667726    reset_thresholds(); 
    668727    copy_from(ht, min_buckets_wanted);   // copy_from() ignores deleted entries 
     
    671730  dense_hashtable& operator= (const dense_hashtable& ht) { 
    672731    if (&ht == this)  return *this;        // don't copy onto ourselves 
    673     clear(); 
     732    if (!ht.use_empty) { 
     733      assert(ht.empty()); 
     734      dense_hashtable empty_table(ht);  // empty table with ht's thresholds 
     735      this->swap(empty_table); 
     736      return *this; 
     737    } 
    674738    hash = ht.hash; 
    675739    equals = ht.equals; 
     
    682746    enlarge_resize_percent = ht.enlarge_resize_percent; 
    683747    shrink_resize_percent = ht.shrink_resize_percent; 
    684     copy_from(ht, HT_MIN_BUCKETS);         // sets num_deleted to 0 too 
     748    copy_from(ht, HT_MIN_BUCKETS);  // calls clear and sets num_deleted to 0 too 
    685749    return *this; 
    686750  } 
     
    713777    STL_NAMESPACE::swap(num_buckets, ht.num_buckets); 
    714778    STL_NAMESPACE::swap(num_elements, ht.num_elements); 
     779    STL_NAMESPACE::swap(num_ht_copies, ht.num_ht_copies); 
    715780    reset_thresholds(); 
    716781    ht.reset_thresholds(); 
     
    719784  // It's always nice to be able to clear a table without deallocating it 
    720785  void clear() { 
     786    const size_type new_num_buckets = min_size(0,0); 
     787    if (num_elements == 0 && 
     788        num_deleted == 0 && 
     789        new_num_buckets == num_buckets) { 
     790      // Table is already empty, and the number of buckets is already as we 
     791      // desire, so nothing to do. 
     792      return; 
     793    } 
    721794    if (table) 
    722795      destroy_buckets(0, num_buckets); 
    723     num_buckets = min_size(0,0);          // our new size 
    724     reset_thresholds(); 
    725     table = (value_type *) realloc(table, num_buckets * sizeof(*table)); 
     796    if (!table || (new_num_buckets != num_buckets)) { 
     797      // Recompute the resize thresholds and realloc the table only if we're 
     798      // actually changing its size. 
     799      num_buckets = new_num_buckets;          // our new size 
     800      reset_thresholds(); 
     801      table = (value_type *) realloc(table, num_buckets * sizeof(*table)); 
     802    } 
    726803    assert(table); 
    727804    fill_range_with_empty(table, table + num_buckets); 
     
    907984  } 
    908985 
    909   // This is really evil: really it should be iterator, not const_iterator. 
    910   // But...the only reason keys are const is to allow lookup. 
    911   // Since that's a moot issue for deleted keys, we allow const_iterators 
     986  // We return the iterator past the deleted item. 
     987  void erase(iterator pos) { 
     988    if ( pos == end() ) return;    // sanity check 
     989    if ( set_deleted(pos) ) {      // true if object has been newly deleted 
     990      ++num_deleted; 
     991      consider_shrink = true;      // will think about shrink after next insert 
     992    } 
     993  } 
     994 
     995  void erase(iterator f, iterator l) { 
     996    for ( ; f != l; ++f) { 
     997      if ( set_deleted(f)  )       // should always be true 
     998        ++num_deleted; 
     999    } 
     1000    consider_shrink = true;        // will think about shrink after next insert 
     1001  } 
     1002 
     1003  // We allow you to erase a const_iterator just like we allow you to 
     1004  // erase an iterator.  This is in parallel to 'delete': you can delete 
     1005  // a const pointer just like a non-const pointer.  The logic is that 
     1006  // you can't use the object after it's erased anyway, so it doesn't matter 
     1007  // if it's const or not. 
    9121008  void erase(const_iterator pos) { 
    9131009    if ( pos == end() ) return;    // sanity check 
     
    9171013    } 
    9181014  } 
    919  
    9201015  void erase(const_iterator f, const_iterator l) { 
    9211016    for ( ; f != l; ++f) { 
     
    10201115  size_type num_buckets; 
    10211116  size_type num_elements; 
     1117  int num_ht_copies;             // a statistics counter incremented every Copy 
    10221118  bool consider_shrink;   // true if we should try to shrink before next insert 
    10231119 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsehash/sparsehashtable.h

    r33943 r36827  
    111111#include <assert.h> 
    112112#include <algorithm>              // For swap(), eg 
     113#include <stdexcept>              // For length_error 
    113114#include <iterator>               // for facts about iterator tags 
    114115#include <utility>                // for pair<> 
     
    403404  key_equal key_eq() const  { return equals; } 
    404405 
     406  // Accessor function for statistics gathering. 
     407  int num_table_copies() const { return num_ht_copies; } 
     408 
    405409 private: 
    406410  // We need to copy values when we set the special marker for deleted 
     
    447451    use_deleted = false; 
    448452  } 
     453  key_type deleted_key() const { 
     454    assert(use_deleted); 
     455    return delkey; 
     456  } 
    449457 
    450458  // These are public so the iterators can use them 
     
    468476            equals(delkey, get_key(*it))); 
    469477  } 
    470   // Set it so test_deleted is true.  true if object didn't used to be deleted 
    471   // See below (at erase()) to explain why we allow const_iterators 
     478  // Set it so test_deleted is true.  true if object didn't used to be deleted. 
     479  bool set_deleted(iterator &it) { 
     480    assert(use_deleted); 
     481    bool retval = !test_deleted(it); 
     482    // &* converts from iterator to value-type. 
     483    set_key(&(*it), delkey); 
     484    return retval; 
     485  } 
     486  // Set it so test_deleted is false.  true if object used to be deleted. 
     487  bool clear_deleted(iterator &it) { 
     488    assert(use_deleted); 
     489    // Happens automatically when we assign something else in its place. 
     490    return test_deleted(it); 
     491  } 
     492 
     493  // We also allow to set/clear the deleted bit on a const iterator. 
     494  // We allow a const_iterator for the same reason you can delete a 
     495  // const pointer: it's convenient, and semantically you can't use 
     496  // 'it' after it's been deleted anyway, so its const-ness doesn't 
     497  // really matter. 
    472498  bool set_deleted(const_iterator &it) { 
    473499    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
    474500    bool retval = !test_deleted(it); 
    475     // &* converts from iterator to value-type 
    476501    set_key(const_cast<value_type*>(&(*it)), delkey); 
    477502    return retval; 
    478503  } 
    479   // Set it so test_deleted is false.  true if object used to be deleted 
    480504  bool clear_deleted(const_iterator &it) { 
    481505    assert(use_deleted);             // bad if set_deleted_key() wasn't called 
    482     // happens automatically when we assign something else in its place 
    483506    return test_deleted(it); 
    484507  } 
     
    507530  // If you like, you can give a min #buckets as well as a min #elts 
    508531  size_type min_size(size_type num_elts, size_type min_buckets_wanted) { 
    509     size_type sz = HT_MIN_BUCKETS; 
    510     while ( sz < min_buckets_wanted || num_elts >= sz * enlarge_resize_percent ) 
     532    size_type sz = HT_MIN_BUCKETS;             // min buckets allowed 
     533    while ( sz < min_buckets_wanted || 
     534            num_elts >= static_cast<size_type>(sz * enlarge_resize_percent) ) { 
     535      if (sz * 2 < sz) 
     536        throw std::length_error("resize overflow");  // protect against overflow 
    511537      sz *= 2; 
     538    } 
    512539    return sz; 
    513540  } 
     
    524551    // like "dense_hash_set<int> x; x.insert(4); x.erase(4);" will 
    525552    // shrink us down to HT_MIN_BUCKETS buckets, which is too small. 
    526     if (shrink_threshold > 0 
    527         && (table.num_nonempty()-num_deleted) < shrink_threshold && 
    528         bucket_count() > HT_DEFAULT_STARTING_BUCKETS ) { 
     553    if (shrink_threshold > 0 && 
     554        (table.num_nonempty()-num_deleted) < shrink_threshold && 
     555        bucket_count() > HT_DEFAULT_STARTING_BUCKETS) { 
    529556      size_type sz = bucket_count() / 2;    // find how much we should shrink 
    530557      while ( sz > HT_DEFAULT_STARTING_BUCKETS && 
     
    555582    const size_type needed_size = min_size(table.num_nonempty() + delta, 0); 
    556583    if ( needed_size > bucket_count() ) {      // we don't have enough buckets 
    557       const size_type resize_to = min_size(table.num_nonempty() - num_deleted 
    558                                            + delta, 0); 
     584      size_type resize_to = min_size(table.num_nonempty() - num_deleted + delta, 
     585                                     bucket_count()); 
     586      if (resize_to < needed_size) { 
     587        // This situation means that we have enough deleted elements, 
     588        // that once we purge them, we won't actually have needed to 
     589        // grow.  But we may want to grow anyway: if we just purge one 
     590        // element, say, we'll have to grow anyway next time we 
     591        // insert.  Might as well grow now, since we're already going 
     592        // through the trouble of copying (in order to purge the 
     593        // deleted elements). 
     594        if (table.num_nonempty() - num_deleted + delta >= 
     595            static_cast<size_type>(resize_to*2 * shrink_resize_percent)) { 
     596          // Good, we won't be below the shrink threshhold even if we double. 
     597          resize_to *= 2; 
     598        } 
     599      } 
     600 
    559601      sparse_hashtable tmp(MoveDontCopy, *this, resize_to); 
    560602      swap(tmp);                             // now we are tmp 
     
    589631      table.set(bucknum, *it);               // copies the value to here 
    590632    } 
     633    num_ht_copies++; 
    591634  } 
    592635 
     
    626669      table.set(bucknum, *it);               // copies the value to here 
    627670    } 
     671    num_ht_copies++; 
    628672  } 
    629673 
     
    673717      table(expected_max_items_in_table == 0 
    674718            ? HT_DEFAULT_STARTING_BUCKETS 
    675             : min_size(expected_max_items_in_table, 0)) { 
     719            : min_size(expected_max_items_in_table, 0)), 
     720      num_ht_copies(0) { 
    676721    reset_thresholds(); 
    677722  } 
     
    688733      enlarge_resize_percent(ht.enlarge_resize_percent), 
    689734      shrink_resize_percent(ht.shrink_resize_percent), 
    690       table() { 
     735      table(), num_ht_copies(ht.num_ht_copies) { 
    691736    reset_thresholds(); 
    692737    copy_from(ht, min_buckets_wanted);   // copy_from() ignores deleted entries 
     
    698743      enlarge_resize_percent(ht.enlarge_resize_percent), 
    699744      shrink_resize_percent(ht.shrink_resize_percent), 
    700       table() { 
     745      table(), num_ht_copies(ht.num_ht_copies) { 
    701746    reset_thresholds(); 
    702747    move_from(mover, ht, min_buckets_wanted);  // ignores deleted entries 
     
    705750  sparse_hashtable& operator= (const sparse_hashtable& ht) { 
    706751    if (&ht == this)  return *this;        // don't copy onto ourselves 
    707     clear(); 
    708752    hash = ht.hash; 
    709753    equals = ht.equals; 
     
    712756    use_deleted = ht.use_deleted; 
    713757    delkey = ht.delkey; 
    714     copy_from(ht, HT_MIN_BUCKETS);         // sets num_deleted to 0 too 
     758    copy_from(ht, HT_MIN_BUCKETS);  // calls clear and sets num_deleted to 0 too 
    715759    return *this; 
    716760  } 
     
    728772    STL_NAMESPACE::swap(delkey, ht.delkey); 
    729773    table.swap(ht.table); 
     774    STL_NAMESPACE::swap(num_ht_copies, ht.num_ht_copies); 
    730775    reset_thresholds(); 
    731776    ht.reset_thresholds(); 
     
    734779  // It's always nice to be able to clear a table without deallocating it 
    735780  void clear() { 
    736     table.clear(); 
     781    if (!empty() || (num_deleted != 0)) { 
     782      table.clear(); 
     783    } 
    737784    reset_thresholds(); 
    738785    num_deleted = 0; 
     
    888935  // DELETION ROUTINES 
    889936  size_type erase(const key_type& key) { 
    890     // First, double-check we're not erasing delkey 
     937    // First, double-check we're not erasing delkey. 
    891938    assert(!use_deleted || !equals(key, delkey)); 
    892939    const_iterator pos = find(key);   // shrug: shouldn't need to be const 
     
    902949  } 
    903950 
    904   // This is really evil: really it should be iterator, not const_iterator. 
    905   // But...the only reason keys are const is to allow lookup. 
    906   // Since that's a moot issue for deleted keys, we allow const_iterators 
     951  // We return the iterator past the deleted item. 
     952  void erase(iterator pos) { 
     953    if ( pos == end() ) return;    // sanity check 
     954    if ( set_deleted(pos) ) {      // true if object has been newly deleted 
     955      ++num_deleted; 
     956      consider_shrink = true;      // will think about shrink after next insert 
     957    } 
     958  } 
     959 
     960  void erase(iterator f, iterator l) { 
     961    for ( ; f != l; ++f) { 
     962      if ( set_deleted(f)  )       // should always be true 
     963        ++num_deleted; 
     964    } 
     965    consider_shrink = true;        // will think about shrink after next insert 
     966  } 
     967 
     968  // We allow you to erase a const_iterator just like we allow you to 
     969  // erase an iterator.  This is in parallel to 'delete': you can delete 
     970  // a const pointer just like a non-const pointer.  The logic is that 
     971  // you can't use the object after it's erased anyway, so it doesn't matter 
     972  // if it's const or not. 
    907973  void erase(const_iterator pos) { 
    908974    if ( pos == end() ) return;    // sanity check 
     
    912978    } 
    913979  } 
    914  
    915980  void erase(const_iterator f, const_iterator l) { 
    916981    for ( ; f != l; ++f) { 
     
    9771042  sparsetable<value_type> table;      // holds num_buckets and num_elements too 
    9781043  bool consider_shrink;   // true if we should try to shrink before next insert 
     1044  int num_ht_copies;        // a statistics counter incremented every Copy/Move 
    9791045 
    9801046  void reset_thresholds() { 
  • lang/ruby/ruby-bayon/trunk/sparsehash/google/sparsetable

    r33943 r36827  
    739739  // We need a small function that tells us how many set bits there are 
    740740  // in positions 0..i-1 of the bitmap.  It uses a big table. 
    741   // We make it static so templates don't allocate lots of these tables 
     741  // We make it static so templates don't allocate lots of these tables. 
     742  // There are lots of ways to do this calculation (called 'popcount'). 
     743  // The 8-bit table lookup is one of the fastest, though this 
     744  // implementation suffers from not doing any loop unrolling.  See, eg, 
     745  //   http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html 
     746  //   http://gurmeetsingh.wordpress.com/2008/08/05/fast-bit-counting-routines/ 
    742747  static size_type pos_to_offset(const unsigned char *bm, size_type pos) { 
    743748    // We could make these ints.  The tradeoff is size (eg does it overwhelm