Skip to content

Commit

Permalink
Merge pull request #116 from xosh/feature_wt
Browse files Browse the repository at this point in the history
Feature wt
  • Loading branch information
simongog committed Sep 26, 2013
2 parents 3aed14a + f8db199 commit 6ac2394
Show file tree
Hide file tree
Showing 5 changed files with 523 additions and 153 deletions.
3 changes: 3 additions & 0 deletions include/sdsl/wt_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,9 @@ struct _int_tree {

//! Return the path as left/right bit sequence in a uint64_t
inline uint64_t bit_path(value_type c)const {
    // Only symbols below m_path.size() have a stored entry; return it as is.
    if (c < m_path.size()) {
        return m_path[c];
    }
    // Every other symbol yields the fallback value m_path.size()-1.
    return m_path.size()-1;
}

Expand Down
221 changes: 211 additions & 10 deletions include/sdsl/wt_int.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class wt_int
typedef t_select_zero select_0_type;
typedef wt_tag index_category;
typedef int_alphabet_tag alphabet_category;
enum {lex_ordered=1};

typedef std::pair<value_type, size_type> point_type;
typedef std::vector<point_type> point_vec_type;
Expand Down Expand Up @@ -111,6 +112,48 @@ class wt_int
m_path_rank_off = int_vector<64>(max_depth+1);
}

// recursive internal version of the method interval_symbols
void _interval_symbols(size_type i, size_type j, size_type& k,
                       std::vector<value_type>& cs,
                       std::vector<size_type>& rank_c_i,
                       std::vector<size_type>& rank_c_j,
                       size_type depth,
                       size_type path,
                       size_type node_size,
                       size_type offset) const {
    // Callers only descend with a non-empty interval (j > i).

    // Bottom of the tree: `path` is the symbol and [i..j) its rank interval.
    if (depth >= m_max_depth) {
        rank_c_i[k] = i;
        rank_c_j[k] = j;
        cs[k] = path;
        ++k;
        return;
    }

    // Count the set bits relative to the start of the current node.
    const size_type rank_at_offset = m_tree_rank(offset);
    const size_type ones_i         = m_tree_rank(offset+i) - rank_at_offset;
    const size_type ones_j         = m_tree_rank(offset+j) - rank_at_offset;
    const size_type ones_node      = m_tree_rank(offset+node_size) - rank_at_offset;
    const size_type zeros_node     = node_size - ones_node;
    const size_type ones_range     = ones_j - ones_i;

    // Left child first, so that symbols are reported in ascending order:
    // visited only if the interval contains at least one 0-bit.
    if ((j-i)-ones_range > 0) {
        _interval_symbols(i-ones_i, j-ones_j, k, cs, rank_c_i, rank_c_j,
                          depth+1, path<<1, zeros_node, offset+m_size);
    }

    // Right child: visited only if the interval contains at least one 1-bit.
    if (ones_range > 0) {
        _interval_symbols(ones_i, ones_j, k, cs, rank_c_i, rank_c_j,
                          depth+1, (path<<1)|1, ones_node, offset+zeros_node+m_size);
    }
}

public:

const size_type& sigma = m_sigma; //!< Effective alphabet size of the wavelet tree.
Expand Down Expand Up @@ -263,8 +306,10 @@ class wt_int
}

//! Recovers the i-th symbol of the original vector.
/*! \param i The index of the symbol in the original vector.
* \returns The i-th symbol of the original vector.
* \par Precondition
* \f$ i < size() \f$
*/
value_type operator[](size_type i)const {
assert(i < size());
Expand Down Expand Up @@ -296,12 +341,17 @@ class wt_int
* \param c The symbol to count the occurrences in the prefix.
* \returns The number of occurrences of symbol c in the prefix [0..i-1] of the supported vector.
* \par Time complexity
* \f$ \Order{\log |\Sigma|} \f$
* \par Precondition
* \f$ i \leq size() \f$
*/
size_type rank(size_type i, value_type c)const {
assert(i <= size());
if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
return 0;
}
size_type offset = 0;
uint64_t mask = (1ULL) << (m_max_depth-1);
uint64_t mask = (1ULL) << (m_max_depth-1);
size_type node_size = m_size;
for (uint32_t k=0; k < m_max_depth and i; ++k) {
size_type ones_before_o = m_tree_rank(offset);
Expand All @@ -327,27 +377,48 @@ class wt_int
/*!
* \param i The index of the symbol.
* \return Pair (rank(i,wt[i]),wt[i])
* \par Precondition
* \f$ i < size() \f$
*/
std::pair<size_type, value_type>
inverse_select(size_type i)const {
    assert(i < size());

    // One top-down pass over the levels: the bit stored at the current
    // position decides which child to descend into and is appended to the
    // symbol, most significant bit first.
    value_type c = 0;
    size_type node_size = m_size, offset = 0;
    for (uint32_t k=0; k < m_max_depth; ++k) {
        // Set-bit counts relative to the start of the current node.
        size_type ones_before_o   = m_tree_rank(offset);
        size_type ones_before_i   = m_tree_rank(offset + i) - ones_before_o;
        size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
        c<<=1;
        if (m_tree[offset+i]) { // go to the right child
            offset += (node_size - ones_before_end); // skip the 0-bits of this node
            node_size = ones_before_end;
            i = ones_before_i;
            c|=1;
        } else { // go to the left child
            node_size = (node_size - ones_before_end);
            i = (i-ones_before_i);
        }
        offset += m_size; // advance to the next level
    }
    // i has been remapped level by level; it is returned as the rank
    // component of the result, c as the symbol.
    return std::make_pair(i,c);
}

//! Calculates the i-th occurrence of the symbol c in the supported vector.
/*!
* \param i The i-th occurrence.
* \param c The symbol c.
* \par Time complexity
* \f$ \Order{\log |\Sigma|} \f$
* \par Precondition
* \f$ 1 \leq i \leq rank(size(), c) \f$
*/
size_type select(size_type i, value_type c)const {
assert(i > 0);
assert(i <= rank(size(), c));
assert(1 <= i and i <= rank(size(), c));
// possible optimization: if the array is a permutation we can start at the bottom of the tree
size_type offset = 0;
uint64_t mask = (1ULL) << (m_max_depth-1);
uint64_t mask = (1ULL) << (m_max_depth-1);
size_type node_size = m_size;
m_path_off[0] = m_path_rank_off[0] = 0;

Expand Down Expand Up @@ -383,6 +454,136 @@ class wt_int
return i-1;
};


//! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
/*!
* \param i The start index (inclusive) of the interval.
* \param j The end index (exclusive) of the interval.
* \param k Reference for number of different symbols in [i..j-1].
* \param cs Reference to a vector that will contain in
* cs[0..k-1] all symbols that occur in [i..j-1] in
* ascending order.
* \param rank_c_i Reference to a vector which equals
* rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$.
* \param rank_c_j Reference to a vector which equals
* rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$.
* \par Time complexity
* \f$ \Order{\min\{\sigma, k \log \sigma\}} \f$
*
* \par Precondition
* \f$ i \leq j \leq size() \f$
* \f$ cs.size() \geq \sigma \f$
* \f$ rank_{c_i}.size() \geq \sigma \f$
* \f$ rank_{c_j}.size() \geq \sigma \f$
*/
void interval_symbols(size_type i, size_type j, size_type& k,
                      std::vector<value_type>& cs,
                      std::vector<size_type>& rank_c_i,
                      std::vector<size_type>& rank_c_j) const {
    assert(i <= j and j <= size());
    k = 0;
    if (i == j) {
        // Empty interval: nothing to report, k stays 0.
    } else if ((i+1) == j) {
        // Exactly one position: a single inverse_select yields the symbol
        // together with its rank at i; its rank at j is one larger.
        const auto hit = inverse_select(i);
        cs[0]       = hit.second;
        rank_c_i[0] = hit.first;
        rank_c_j[0] = hit.first+1;
        k = 1;
    } else {
        // General case: recursive descent starting at the root node, which
        // begins at offset 0 and spans m_size positions.
        _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0);
    }
}

//! How many symbols are lexicographically smaller/greater than c in [i..j-1].
/*!
* \param i Start index (inclusive) of the interval.
* \param j End index (exclusive) of the interval.
* \param c Symbol c.
* \return A triple containing:
* * rank(i,c)
* * #symbols smaller than c in [i..j-1]
* * #symbols greater than c in [i..j-1]
*
* \par Precondition
* \f$ i \leq j \leq size() \f$
*/
template<class t_ret_type = std::tuple<size_type, size_type, size_type>>
t_ret_type lex_count(size_type i, size_type j, value_type c)const {
    assert(i <= j and j <= size());
    if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
        return t_ret_type {0, j-i, 0};
    }
    size_type cnt_smaller = 0;
    size_type cnt_greater = 0;
    size_type node_begin  = 0;
    size_type node_len    = m_size;
    uint64_t  bit         = (1ULL) << (m_max_depth-1);
    // Follow the bits of c from the most significant one downwards; after
    // each level move on to the next level and the next bit.
    for (uint32_t level=0; level < m_max_depth; ++level, node_begin += m_size, bit >>= 1) {
        // Set-bit counts relative to the start of the current node.
        const size_type base      = m_tree_rank(node_begin);
        const size_type ones_i    = m_tree_rank(node_begin + i) - base;
        const size_type ones_j    = m_tree_rank(node_begin + j) - base;
        const size_type ones_node = m_tree_rank(node_begin + node_len) - base;
        if (c & bit) {
            // Descend right: the 0-bits inside [i..j) are added to the
            // "smaller" counter.
            cnt_smaller += j-i-ones_j+ones_i;
            node_begin  += (node_len - ones_node);
            node_len     = ones_node;
            i = ones_i;
            j = ones_j;
        } else {
            // Descend left: the 1-bits inside [i..j) are added to the
            // "greater" counter.
            cnt_greater += ones_j-ones_i;
            node_len    -= ones_node;
            i -= ones_i;
            j -= ones_j;
        }
    }
    return t_ret_type {i, cnt_smaller, cnt_greater};
}

//! How many symbols are lexicographically smaller than c in [0..i-1].
/*!
* \param i Exclusive right bound of the range.
* \param c Symbol c.
* \return A tuple containing:
* * rank(i,c)
* * #symbols smaller than c in [0..i-1]
* \par Precondition
* \f$ i \leq size() \f$
*/
template<class t_ret_type = std::tuple<size_type, size_type>>
t_ret_type lex_smaller_count(size_type i, value_type c) const {
    assert(i <= size());
    if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
        return t_ret_type {0, i};
    }
    size_type cnt_smaller = 0;
    size_type node_begin  = 0;
    size_type node_len    = m_size;
    uint64_t  bit         = (1ULL) << (m_max_depth-1);
    // Follow the bits of c from the most significant one downwards; the
    // descent stops early as soon as the prefix length i drops to zero.
    for (uint32_t level=0; level < m_max_depth and i; ++level, node_begin += m_size, bit >>= 1) {
        // Set-bit counts relative to the start of the current node.
        const size_type base      = m_tree_rank(node_begin);
        const size_type ones_i    = m_tree_rank(node_begin + i) - base;
        const size_type ones_node = m_tree_rank(node_begin + node_len) - base;
        if (c & bit) {
            // Descend right: the 0-bits in the prefix are added to the
            // "smaller" counter.
            cnt_smaller += i - ones_i;
            node_begin  += (node_len - ones_node);
            node_len     = ones_node;
            i = ones_i;
        } else {
            // Descend left: keep only the 0-bits of the prefix.
            node_len = (node_len - ones_node);
            i -= ones_i;
        }
    }
    return t_ret_type {i, cnt_smaller};
}

//! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb].
/*! \param lb Left bound of index interval (inclusive)
* \param rb Right bound of index interval (inclusive)
Expand Down
Loading

0 comments on commit 6ac2394

Please sign in to comment.