Merge pull request #116 from xosh/feature_wt

Feature wt
simongog · Sep 26, 2013 · 6ac2394 · 6ac2394
2 parents 3aed14a + f8db199
commit 6ac2394
Show file tree

Hide file tree

Showing 5 changed files with 523 additions and 153 deletions.
diff --git a/include/sdsl/wt_helper.hpp b/include/sdsl/wt_helper.hpp
@@ -526,6 +526,9 @@ struct _int_tree {
 
     //! Return the path as left/right bit sequence in a uint64_t
     inline uint64_t bit_path(value_type c)const {
+        if (c >= m_path.size()) {
+            return m_path.size()-1;
+        }
         return m_path[c];
     }
 

diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp
@@ -72,6 +72,7 @@ class wt_int
         typedef t_select_zero                        select_0_type;
         typedef wt_tag                               index_category;
         typedef int_alphabet_tag                     alphabet_category;
+        enum 	{lex_ordered=1};
 
         typedef std::pair<value_type, size_type>     point_type;
         typedef std::vector<point_type>              point_vec_type;
@@ -111,6 +112,48 @@ class wt_int
             m_path_rank_off = int_vector<64>(max_depth+1);
         }
 
+        // recursive internal version of the method interval_symbols
+        void _interval_symbols(size_type i, size_type j, size_type& k,
+                               std::vector<value_type>& cs,
+                               std::vector<size_type>& rank_c_i,
+                               std::vector<size_type>& rank_c_j,
+                               size_type depth,
+                               size_type path,
+                               size_type node_size,
+                               size_type offset) const {
+            // invariant: j>i
+
+            if (depth >= m_max_depth) {
+                rank_c_i[k]= i;
+                rank_c_j[k]= j;
+                cs[k++]= path;
+                return;
+            }
+
+            size_type ones_before_o = m_tree_rank(offset);
+            size_type ones_before_i = m_tree_rank(offset+i) - ones_before_o;
+            size_type ones_before_j = m_tree_rank(offset+j) - ones_before_o;
+            size_type ones_before_end = m_tree_rank(offset+ node_size) - ones_before_o;
+
+            // goto left child
+            if ((j-i)-(ones_before_j-ones_before_i)>0) {
+                size_type new_offset = offset + m_size;
+                size_type new_node_size = node_size - ones_before_end;
+                size_type new_i = i - ones_before_i;
+                size_type new_j = j - ones_before_j;
+                _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, path<<1, new_node_size, new_offset);
+            }
+
+            // goto right child
+            if ((ones_before_j-ones_before_i)>0) {
+                size_type new_offset = offset+(node_size - ones_before_end) + m_size;
+                size_type new_node_size = ones_before_end;
+                size_type new_i = ones_before_i;
+                size_type new_j = ones_before_j;
+                _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, (path<<1)|1, new_node_size, new_offset);
+            }
+        }
+
     public:
 
         const size_type&       sigma = m_sigma; //!< Effective alphabet size of the wavelet tree.
@@ -263,8 +306,10 @@ class wt_int
         }
 
         //! Recovers the i-th symbol of the original vector.
-        /*! \param i The index of the symbol in the original vector. \f$i \in [0..size()-1]\f$
+        /*! \param i The index of the symbol in the original vector.
          *  \returns The i-th symbol of the original vector.
+         *  \par Precondition
+         *       \f$ i < size() \f$
          */
         value_type operator[](size_type i)const {
             assert(i < size());
@@ -296,12 +341,17 @@ class wt_int
          *  \param c The symbol to count the occurrences in the prefix.
          *    \returns The number of occurrences of symbol c in the prefix [0..i-1] of the supported vector.
          *  \par Time complexity
-         *        \f$ \Order{\log |\Sigma|} \f$
+         *       \f$ \Order{\log |\Sigma|} \f$
+         *  \par Precondition
+         *       \f$ i \leq size() \f$
          */
         size_type rank(size_type i, value_type c)const {
             assert(i <= size());
+            if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
+                return 0;
+            }
             size_type offset = 0;
-            uint64_t mask     = (1ULL) << (m_max_depth-1);
+            uint64_t mask = (1ULL) << (m_max_depth-1);
             size_type node_size = m_size;
             for (uint32_t k=0; k < m_max_depth and i; ++k) {
                 size_type ones_before_o   = m_tree_rank(offset);
@@ -327,27 +377,48 @@ class wt_int
         /*!
          *  \param i The index of the symbol.
          *  \return  Pair (rank(wt[i],i),wt[i])
+         *  \par Precondition
+         *       \f$ i < size() \f$
          */
         std::pair<size_type, value_type>
         inverse_select(size_type i)const {
             assert(i < size());
-            value_type c = (*this)[i];
-            return std::make_pair(rank(i, c),c);
+
+            value_type c = 0;
+            size_type node_size = m_size, offset = 0;
+            for (uint32_t k=0; k < m_max_depth; ++k) {
+                size_type ones_before_o   = m_tree_rank(offset);
+                size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
+                size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
+                c<<=1;
+                if (m_tree[offset+i]) { // go to the right child
+                    offset += (node_size - ones_before_end);
+                    node_size = ones_before_end;
+                    i = ones_before_i;
+                    c|=1;
+                } else { // go to the left child
+                    node_size = (node_size - ones_before_end);
+                    i = (i-ones_before_i);
+                }
+                offset += m_size;
+            }
+            return std::make_pair(i,c);
         }
 
         //! Calculates the i-th occurrence of the symbol c in the supported vector.
         /*!
-         *  \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
+         *  \param i The i-th occurrence.
          *  \param c The symbol c.
          *  \par Time complexity
-         *        \f$ \Order{\log |\Sigma|} \f$
+         *       \f$ \Order{\log |\Sigma|} \f$
+         *  \par Precondition
+         *       \f$ 1 \leq i \leq rank(size(), c) \f$
          */
         size_type select(size_type i, value_type c)const {
-            assert(i > 0);
-            assert(i <= rank(size(), c));
+            assert(1 <= i and i <= rank(size(), c));
             // possible optimization: if the array is a permutation we can start at the bottom of the tree
             size_type offset = 0;
-            uint64_t mask     = (1ULL) << (m_max_depth-1);
+            uint64_t mask    = (1ULL) << (m_max_depth-1);
             size_type node_size = m_size;
             m_path_off[0] = m_path_rank_off[0] = 0;
 
@@ -383,6 +454,136 @@ class wt_int
             return i-1;
         };
 
+
+        //! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
+        /*!
+         * \param i        The start index (inclusive) of the interval.
+         * \param j        The end index (exclusive) of the interval.
+         * \param k        Reference for number of different symbols in [i..j-1].
+         * \param cs       Reference to a vector that will contain in
+         *                 cs[0..k-1] all symbols that occur in [i..j-1] in
+         *                 ascending order.
+         * \param rank_c_i Reference to a vector which equals
+         *                 rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$.
+         * \param rank_c_j Reference to a vector which equals
+         *                 rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$.
+         * \par Time complexity
+         *      \f$ \Order{\min{\sigma, k \log \sigma}} \f$
+         *
+         * \par Precondition
+         *      \f$ i \leq j \leq size() \f$
+         *      \f$ cs.size() \geq \sigma \f$
+         *      \f$ rank_{c_i}.size() \geq \sigma \f$
+         *      \f$ rank_{c_j}.size() \geq \sigma \f$
+         */
+        void interval_symbols(size_type i, size_type j, size_type& k,
+                              std::vector<value_type>& cs,
+                              std::vector<size_type>& rank_c_i,
+                              std::vector<size_type>& rank_c_j) const {
+            assert(i <= j and j <= size());
+            k=0;
+            if (i==j) {
+                return;
+            }
+            if ((i+1)==j) {
+                auto res = inverse_select(i);
+                cs[0]=res.second;
+                rank_c_i[0]=res.first;
+                rank_c_j[0]=res.first+1;
+                k=1;
+                return;
+            }
+
+            _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0);
+
+        }
+
+        //! How many symbols are lexicographic smaller/greater than c in [i..j-1].
+        /*!
+         * \param i       Start index (inclusive) of the interval.
+         * \param j       End index (exclusive) of the interval.
+         * \param c       Symbol c.
+         * \return A triple containing:
+         *         * rank(c,i)
+         *         * #symbols smaller than c in [i..j-1]
+         *         * #symbols greater than c in [i..j-1]
+         *
+         * \par Precondition
+         *      \f$ i \leq j \leq size() \f$
+         */
+        template<class t_ret_type = std::tuple<size_type, size_type, size_type>>
+        t_ret_type lex_count(size_type i, size_type j, value_type c)const {
+            assert(i <= j and j <= size());
+            if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
+                return t_ret_type {0, j-i, 0};
+            }
+            size_type offset  = 0;
+            size_type smaller = 0;
+            size_type greater = 0;
+            uint64_t mask     = (1ULL) << (m_max_depth-1);
+            size_type node_size = m_size;
+            for (uint32_t k=0; k < m_max_depth; ++k) {
+                size_type ones_before_o   = m_tree_rank(offset);
+                size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
+                size_type ones_before_j   = m_tree_rank(offset + j) - ones_before_o;
+                size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
+                if (c & mask) { // search for a one at this level
+                    offset += (node_size - ones_before_end);
+                    node_size = ones_before_end;
+                    smaller += j-i-ones_before_j+ones_before_i;
+                    i = ones_before_i;
+                    j = ones_before_j;
+                } else { // search for a zero at this level
+                    node_size -= ones_before_end;
+                    greater += ones_before_j-ones_before_i;
+                    i -= ones_before_i;
+                    j -= ones_before_j;
+                }
+                offset += m_size;
+                mask >>= 1;
+            }
+            return t_ret_type {i, smaller, greater};
+        };
+
+        //! How many symbols are lexicographic smaller than c in [0..i-1].
+        /*!
+         * \param i Exclusive right bound of the range.
+         * \param c Symbol c.
+         * \return A tuple containing:
+         *         * rank(c,i)
+         *         * #symbols smaller than c in [0..i-1]
+         * \par Precondition
+         *      \f$ i \leq size() \f$
+         */
+        template<class t_ret_type = std::tuple<size_type, size_type>>
+        t_ret_type lex_smaller_count(size_type i, value_type c) const {
+            assert(i <= size());
+            if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
+                return t_ret_type {0, i};
+            }
+            size_type offset = 0;
+            size_type result = 0;
+            uint64_t mask    = (1ULL) << (m_max_depth-1);
+            size_type node_size = m_size;
+            for (uint32_t k=0; k < m_max_depth and i; ++k) {
+                size_type ones_before_o   = m_tree_rank(offset);
+                size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
+                size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
+                if (c & mask) { // search for a one at this level
+                    offset   += (node_size - ones_before_end);
+                    node_size = ones_before_end;
+                    result   += i - ones_before_i;
+                    i         = ones_before_i;
+                } else { // search for a zero at this level
+                    node_size = (node_size - ones_before_end);
+                    i        -= ones_before_i;
+                }
+                offset += m_size;
+                mask >>= 1;
+            }
+            return t_ret_type {i, result};
+        }
+
         //! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb].
         /*! \param lb     Left bound of index interval (inclusive)
          *  \param rb     Right bound of index interval (inclusive)