////////////////////////////////////////////// // // HashMap.h // // // Notes: (at the end .. random thoughts) // ////////////////////////////////////////////// #ifndef __HASH_MAP_H__ #define __HASH_MAP_H__ #include <list> #include <vector> #include <utility> #include <string> namespace qutils { // ------------------------------------------------------------- // // Not he moist efficient way to compute primes // but serves for our purposes in HashMap implementation // // ------------------------------------------------------------- bool IsPrime (int k) { int sqrt_k = std::sqrtf (k); int factor = 2; bool prime = true; while (1) { if (factor > sqrt_k) break; // if the factor completely divides k, then k can not // be a prime // if remained is anything else, check the next factor // upto sqrt of k if ( (k % factor) == 0) { prime = false; break; } else factor++; // check the next factor } return (prime); } int NextPrime (int i) { // returns the next higher prime number // i can be any integer int k = i+1; if ( (k & 0x1) == 0) // means k is even k++; bool prime = IsPrime (k); while (!prime) { k += 2; // chekc all odds only prime = IsPrime (k); } return (k); } // --------------------------------------------------------------- template <class KEY> class HashFunctor { public: int operator() (const KEY& key) ; // provide no implmentation .. so that compiler cries when this one gets used }; // Providing some default hash functors // for key == int template <> class HashFunctor<int> { int operator() (int key) { return (key); } // use the mod operator f\in the caller method of the hash table as only hash table knows its size }; template <> class HashFunctor<unsigned int> { int operator() (int key) { return (key); } // use the mod operator f\in the caller method of the hash table as only hash table knows its size }; // this <> signifies that this specialization can be specified // without the use of any formal paramaters template <> class HashFunctor<char*> { public: int operator() (const char* key) { int len = strlen(key); int sum = 0; for (int i=0; i<len; i++) sum += key[i] * i; return (sum); } }; template <> class HashFunctor<std::string> { public: int operator() (const std::string& key) { HashFunctor<char*> hf; return (hf (key.c_str() ) ); } }; //================================================================================================== template <typename KEY, typename T, typename HFNTR = HashFunctor<KEY> > class HashMap { friend class iterator; // disable copy CTOR and assigment operators??? should we ? // ANS: follow what the std::map does! // later int _capacity; // total slots int _size; // currently occupied slots // not using right now though typedef std::pair<KEY, T> KeyTPair; // this is used so that interface is the same as that of the std::map // otherwise we could easily have stored only T // there os another very importnat reason for storing pairs of Key and T // and that is, two difference keys may map to the same index in the hash table // therefore we must do a search, matching keys in that index to get the correct typedef std::list<KeyTPair> KeyTPairs; // keeping the same name to avoid lotsa' changes typedef std::vector<KeyTPairs> KeyTPairsVec; KeyTPairsVec _cells; HFNTR _hashFunctor; public: // CTORS // explicit HashMap (int capacity = 101) : _size(0), _capacity (capacity), _cells (capacity) {} class iterator // class name is small to conform with stl { typedef typename HashMap <KEY, T, HFNTR> ThisHashMap; ThisHashMap* _map; int _currentIndex; // index in the vecotr at which we will be iterating the list // each vector slot contains a list ThisHashMap::KeyTPairs::iterator _it; // the actual iteratr over the KeyTPairs; public: // defualt CTOR which does nothing and will infact fault if tried to use un-initialized // a properly initialized one can only be obtained form maps' end() and begin() methods // and then can be succesfully used in ++ operator style! iterator() : _currentIndex (0), _map (NULL) {} // purposely leaving it unitialized; // this iterator CTOr is used only by begin() and end() methods from the HashMap class // it is not to be used by others iterator (ThisHashMap* map, int currentIndex, ThisHashMap::KeyTPairs::iterator it) : _map (map), _currentIndex (0), _it (it) { } const iterator& operator = (const iterator& iter) { _map = iter._map; _currentIndex = iter._currentIndex; _it = iter._it; return (*this); } iterator (const iterator& iter) // copy CTOR { ( (*this) = iter ); // do via operator = } bool operator == (const iterator& iter) const { if (_map == iter._map && _currentIndex == iter._currentIndex && _it == iter._it) return (true); else return (false); } bool operator != (const iterator& iter) const { return (! operator == (iter)); } typename ThisHashMap::KeyTPair* operator -> () { return ( & (*_it) ); } typename ThisHashMap::KeyTPair& operator * () { return (*_it); } // type conversion operator operator ThisHashMap::KeyTPairs::iterator& () { return (_it); } iterator& operator++ () // pre operator { assert (_map); // that is, it is a valid iterator ++ _it; if ( _it == _map->_cells[_currentIndex].end() ) { // if end of current list, move to next slot ++_currentIndex; // now keep moving to subsequent slots as long as there are // no entries OR we reach the end of the // if it is the last cell, we are done traversing the hash map while (_currentIndex < _map->_capacity && _map->_cells[_currentIndex].size() == 0) ++_currentIndex; if (_currentIndex == _map->_capacity) (*this) = _map->end(); // this is the same as the last lists's end() else _it = _map->_cells[_currentIndex].begin(); } return (*this); } // type conversion operator to _it operator ThisHashMap::KeyTPairs::iterator () { return (_it); } }; // end iterator class // HashMap methods again // iterator begin () { iterator it(this, 0, _cells[0].begin()); if (_cells[0].size() == 0 ) ++it; // ++it will either make the iterator point to the next valid one // OR, it will make it equal to end! return (it); } iterator end () { iterator it ( this, _capacity-1, _cells[_capacity - 1].end() ); return (it); } // we do this so that we can return a valid 'end' iterator from find routine if it fails // operator [] returns the list for that list; // i.e given a key value, it returns the list that can possibly contain that value // once we obtain the list, we can do search, insertion or removal // operator [] // // returnes reference to the first T found with key 'key' // if no such entry exists, one is created and reference to it is returned T& operator [] (const KEY& key) { // use key to find the index iterator it = find (key); if ( it == end()) { T t; //note: do not do t() ... as () is not guaranteed to be present for all types // and it was giving weird results it = insert ( std::make_pair (key, t) ); } return ( it->second); } // direct methods for insertion, removal and search iterator find (const KEY& key) { int index = _hashFunctor (key); index = index % _capacity; KeyTPairs& l = _cells[index]; // now two different keys could map to same index // therefore we do a search in this list, matching the keys // (also this is the reason we store the key with the data) // and return the first such match found // note: our hash map can hold multiple entries with the same key // use insert methods to add multiples of such // operator[] will return erference to the first such entry // to obtain others, use find() method to return an iterator // and then use ++ operator on the iterator KeyTPairs::iterator it = l.begin(); while ( it != l.end() ) { if (it->first == key) break; else ++it; } if (it == l.end() ) return (end()); else { iterator it_return (this, index, it); return (it_return); } } // remove all items matching this key // returns the number of items removed int erase (const KEY& key) { int num_removed = 0; int index = _hashFunctor (key); index = index % _capacity; KeyTPairs& l = _cells[index]; // that is, we keep hold of a refernce to the list // now two different keys could map to same index // therefore we doa search in this list, matching the keys // and one by one remove all enteries that have this key KeyTPairs::iterator it = l.begin(); while ( it != l.end() ) { if (it->first == key) { it = l.erase (it); num_removed++; } } size -= num_removed; return (num_removed); } iterator insert ( const KeyTPair& key_t_pair ) { // this one does not use operator [], therefore this method // can be used to insert duplicates int index = _hashFunctor (key_t_pair.first); index = index % _capacity; KeyTPairs& l = _cells[index]; l.push_back (key_t_pair); _size++; if (_size > (0.67f * _capacity)) resize(); // whenever size approaches 67% of total capacity, it is time to to rehash KeyTPairs::iterator list_iter = l.end(); //iterator it = l.end(); -- list_iter; //std::advance (list_iter, -1); // actualy both techniques are ok // this iterator now points to the pair we just inserted // we now use this iterator to generate a HashMap::iterator iterator it_return (this, index, list_iter); return (it_return); } void resize() { // first make a temporary copy of the exisitng hash table HashTable tmp_table (_capacity); // create a temporary new hash table of the same size as existing one // and copy the existing one here! iterator it = begin(); while (it != end()) { tmp_table.insert (*it); ++it; } // now clear the existing one _cells.clear(); _capacity = NextPrime (2 * _capacity); _cells.resize ( _capacity); it = tmp_table.begin(); while (it != tmp_table.end()) { insert (*it); ++it; } } }; } // end namespace qutils #endif // // Approach: // --------- // // // Bascially a HashTable works as follows // // Data is represented as pairs of key, values // // In a map, data is stored sorted on the < criteria (by default) based on the key // and given a key, the map is binary searched to retrieve the value // // In a hash table, an algorthm is run on the key, caleld the hash function // The result of the hash function is an integer value // If algorithm can be chosen carefully such that the likelyhood of // two different keys generating the same number is very very less // // the key, value pairs are actually stored in an array // and the above number is MOD (%) with the table size to give the index // at which the pair is stored! // // Hash Algos: // for integer keys, a simplest one is the Mod operation directly! // there are ofcrouse others, like xorring the individual bits of the number etc.. // for string keys, summ (char[i] *i ) is a good one. // // However two or more keys CAN map to same integer result // so at each slot rather than storing the pair directly, we have a list // and then we just search that list for the pair! // //////////////////////////////////////////////////////// ///////////////////////////////////////////////////// // in the following // T = type of data to store // KEY = the type of the key, (number, int, strings ..etc // H = Hash functor // // since the size of the hash table must be resizeable we are // not passing in that as a paramter // And also note that N should be a prime number to avooid collisions // // the general usage model should be similar to that of the map (from Strouse Strup'sidea) // //////////////////////////////////////////////////// // // signature of the hash function // // A hash function/functor should accept a key type and return an index // // note: since the functor does not know about the size of the hash table // we simply return the number computed // And before using it, we do the % i nthe caller (from within hash table method i.e.) // the sgnature of the functor should simply look like