From 6fb6ac0390b83cc457be961819c717516572c977 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Fri, 29 Apr 2016 09:32:28 +1000 Subject: [PATCH 01/21] Added csa_alphabet_strategy which allows use of context lengths larger than one --- include/sdsl/csa_sada.hpp | 47 +++++++++---- include/sdsl/suffix_array_algorithm.hpp | 90 +++++++++++++++++++++++-- include/sdsl/suffix_array_helper.hpp | 3 +- 3 files changed, 122 insertions(+), 18 deletions(-) diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index 8b4637cb1..ec3a5ca3a 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -140,6 +140,7 @@ class csa_sada const typename alphabet_type::comp2char_type& comp2char = m_alphabet.comp2char; const typename alphabet_type::C_type& C = m_alphabet.C; const typename alphabet_type::sigma_type& sigma = m_alphabet.sigma; + const alphabet_type& alphabet = m_alphabet; const psi_type& psi = m_psi; const lf_type lf = lf_type(*this); const bwt_type bwt = bwt_type(*this); @@ -285,27 +286,21 @@ class csa_sada return t_dens; } - private: - - // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text. + // Calculates how many symbols cc are in the prefix [0..i-1] of the BWT of the original text. /* - * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. - * \param c The symbol to count the occurrences in the prefix. - * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT. + * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. + * \param cc The compactified symbol to count in the prefix. + * \returns The number of occurrences of the compactified symbol cc in the prefix [0..i-1]. * \par Time complexity * \f$ \Order{\log n t_{\Psi}} \f$ */ - size_type rank_bwt(size_type i, const char_type c)const + template + size_type rank_comp_bwt(size_type i, const t_char cc)const { - comp_char_type cc = char2comp[c]; - if (cc==0 and c!=0) // character is not in the text => return 0 - return 0; if (i == 0) return 0; assert(i <= size()); - size_type lower_b, upper_b; // lower_b inclusive, upper_b exclusive - const size_type sd = m_psi.get_sample_dens(); size_type lower_sb = (C[cc]+sd-1)/sd; // lower_sb inclusive size_type upper_sb = (C[cc+1]+sd-1)/sd; // upper_sb exclusive @@ -367,6 +362,30 @@ class csa_sada } } + private: + + // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text. + /* + * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. + * \param c The symbol to count in the prefix. + * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT. + * \par Time complexity + * \f$ \Order{\log n t_{\Psi}} \f$ + */ + // replace const char_type c by const std::array& c + template + size_type rank_bwt(size_type i, const t_char c)const + { + auto cc = char2comp[c]; + if (cc==0 and c!=0) // character is not in the text => return 0 + return 0; + if (i == 0) + return 0; + return rank_comp_bwt(i, cc); + } + + + // Calculates the position of the i-th c in the BWT of the original text. /* * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$. @@ -402,7 +421,9 @@ csa_sada text_buf(cache_file_name(key_trait::KEY_TEXT,config)); + alphabet_type tmp_alphabet(text_buf, n); m_alphabet.swap(tmp_alphabet); } diff --git a/include/sdsl/suffix_array_algorithm.hpp b/include/sdsl/suffix_array_algorithm.hpp index 02b4be40f..b849ef6e5 100644 --- a/include/sdsl/suffix_array_algorithm.hpp +++ b/include/sdsl/suffix_array_algorithm.hpp @@ -172,12 +172,12 @@ typename t_csa::size_type backward_search( ) { assert(l <= r); assert(r < csa.size()); - typename t_csa::size_type cc = csa.char2comp[c]; + auto cc = csa.char2comp[c]; if (cc == 0 and c > 0) { l_res = 1; r_res = 0; } else { - typename t_csa::size_type c_begin = csa.C[cc]; + auto c_begin = csa.C[cc]; if (l == 0 and r+1 == csa.size()) { l_res = c_begin; r_res = csa.C[cc+1] - 1; @@ -238,6 +238,89 @@ backward_search( return r+1-l; } + + +template +uint64_t backward_search( + const csa_sada>& csa, + uint64_t l, + uint64_t r, + t_pat_iter begin, + t_pat_iter end, + uint64_t& l_res, + uint64_t& r_res +) +{ + assert(l <= r); assert(r < csa.size()); + typedef typename std::remove_reference::type t_csa; + using multi_comp_char_type = typename t_csa::alphabet_type::multi_comp_char_type; + constexpr auto q = t_csa::alphabet_type::q; + + auto m = std::distance(begin, end); + if (static_cast(m) < q) { + t_pat_iter it = end; + while (begin < it and r+1-l > 0) { + --it; + backward_search(csa, l, r, (typename t_csa::char_type)*it, l, r); + } + l_res = l; + r_res = r; + return r+1-l; + } + + multi_comp_char_type x {0}; + t_pat_iter it = end; + size_t processed = 0; +// std::cout<<"simga="<(x+1)<& text_buf, int_vector_size_type len): + char2comp(this), comp2char(this), C(m_C), sigma(m_sigma), sigma_q(m_sigma_q), + sigma_q_1(m_sigma_q_1) + { + m_sigma = 0; + if (0 == len or 0 == text_buf.size()) + return; + assert(len <= text_buf.size()); + // initialize vectors + int_vector<64> D(257, 0); + bit_vector tmp_char(256, 0); + // count occurrences of each symbol +// std::cout<<"text="; + for (size_type i=0; i < len; ++i) { + ++D[text_buf[i]]; +// std::cout<<(char)text_buf[i]; + } +// std::cout<(m_sigma+1, 0, bits::hi(len)+1); + m_C.multi_C = int_vector<>(m_sigma_q+1, 0, bits::hi(len)+1); + + for (int i=(int)m_sigma; i > 0; --i) m_C.C[i] = D[i-1]; + m_C.C[0] = 0; + for (int i=1; i <= (int)m_sigma; ++i) m_C.C[i] = m_C.C[i] + m_C.C[i-1]; + assert(m_C.C[sigma]==len); + m_char = tmp_char; + util::init_support(m_char_rank, &m_char); + util::init_support(m_char_select, &m_char); + if (t_q == 1) { + m_C.multi_C = m_C.C; + } else if (t_q > 1) { + int_vector<64> multi_D(m_sigma_q+1, 0); + // count occurrences of each symbol + uint64_t x = 0; + for (size_type i=0; i::size_type size_type; - typedef char2comp_wrapper char2comp_type; - typedef comp2char_wrapper comp2char_type; + typedef char2comp_wrapper_int char2comp_type; + typedef comp2char_wrapper_int comp2char_type; typedef C_array_type C_type; typedef uint64_t sigma_type; typedef uint64_t char_type; @@ -411,12 +799,12 @@ class int_alphabet enum { int_width = 0 }; //! Helper class for the char2comp mapping - class char2comp_wrapper + class char2comp_wrapper_int { private: const int_alphabet* m_strat; public: - char2comp_wrapper(const int_alphabet* strat) : m_strat(strat) {} + char2comp_wrapper_int(const int_alphabet* strat) : m_strat(strat) {} comp_char_type operator[](char_type c) const { if (m_strat->m_char.size() > 0) { // if alphabet is not continuous @@ -433,12 +821,12 @@ class int_alphabet }; //! Helper class for the comp2char mapping - class comp2char_wrapper + class comp2char_wrapper_int { private: const int_alphabet* m_strat; public: - comp2char_wrapper(const int_alphabet* strat) : m_strat(strat) {} + comp2char_wrapper_int(const int_alphabet* strat) : m_strat(strat) {} char_type operator[](comp_char_type c) const { if (m_strat->m_char.size() > 0) { // if alphabet is not continuous @@ -477,6 +865,7 @@ class int_alphabet return ((--D.end())->first + 1) == D.size(); } } + public: const char2comp_type char2comp; From 22e62233d45c847832a68ff4566c159e430cc781 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Fri, 29 Apr 2016 15:56:09 +1000 Subject: [PATCH 03/21] Simplified rank_bwt --- include/sdsl/csa_sada.hpp | 113 +++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index ec3a5ca3a..b0649f4dc 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -300,70 +300,69 @@ class csa_sada if (i == 0) return 0; assert(i <= size()); - size_type lower_b, upper_b; // lower_b inclusive, upper_b exclusive - const size_type sd = m_psi.get_sample_dens(); - size_type lower_sb = (C[cc]+sd-1)/sd; // lower_sb inclusive - size_type upper_sb = (C[cc+1]+sd-1)/sd; // upper_sb exclusive - while (lower_sb+1 < upper_sb) { - size_type mid = (lower_sb+upper_sb)/2; - if (m_psi.sample(mid) >= i) - upper_sb = mid; - else - lower_sb = mid; + const auto cc_begin = C[cc]; // begin of interval of context cc (inclusive) + const auto cc_end = C[cc+1]; // end of interval of context cc (exclusive) + const size_type sd = m_psi.get_sample_dens(); + size_type s_begin = (cc_begin+sd)/sd; // first sample after cc_begin + size_type s_end = (cc_end+sd-1)/sd; // first sample at or after cc_end + + if (s_begin == s_end) { + // Case (1): No sample inside [cc_begin, cc_end) + // => search in previous block (s_begin-1) + } else if (m_psi.sample(s_begin) >= i) { // now s_begin < s_end + // Case (2): Some samples inside [cc_begin, cc_end) + // and first sample already larger or equal to i + // => search in previous block (s_begin-1) + } else { // still s_begin < s_end + // Case (3): Some samples inside [cc_begin, cc_end) + // and first sample smaller than i + // => binary search for first sample >= i + s_begin = upper_bound(s_begin, s_end, i-1); + // => search in previous block (s_begin-1) } - - if (lower_sb == upper_sb) { // the interval was smaller than sd - lower_b = C[cc]; upper_b = C[cc+1]; - } else if (lower_sb > (C[cc]+sd-1)/sd) { // main case -// TODO: don't use get_inter_sampled_values if t_dens is really -// large - lower_b = lower_sb*sd; - if (0 == m_psi_buf.size()) { - upper_b = std::min(upper_sb*sd, C[cc+1]); - goto finish; - } - uint64_t* p = m_psi_buf.data(); - // extract the psi values between two samples - m_psi.get_inter_sampled_values(lower_sb, p); - p = m_psi_buf.data(); - uint64_t smpl = m_psi.sample(lower_sb); - // handle border cases - if (lower_b + m_psi.get_sample_dens() >= C[cc+1]) - m_psi_buf[ C[cc+1]-lower_b ] = size()-smpl; - else - m_psi_buf[ m_psi.get_sample_dens() ] = size()-smpl; - // search the result linear - while ((*p++)+smpl < i); - - return p-1-m_psi_buf.data() + lower_b - C[cc]; - } else { // lower_b == (m_C[cc]+sd-1)/sd and lower_sb < upper_sb - if (m_psi.sample(lower_sb) >= i) { - lower_b = C[cc]; - upper_b = lower_sb * sd + 1; - } else { - lower_b = lower_sb * sd; - upper_b = std::min(upper_sb*sd, C[cc+1]); - } - } -finish: - // binary search the interval [C[cc]..C[cc+1]-1] for the result -// size_type lower_b = m_C[cc], upper_b = m_C[cc+1]; // lower_b inclusive, upper_b exclusive - while (lower_b+1 < upper_b) { - size_type mid = (lower_b+upper_b)/2; - if (m_psi[mid] >= i) - upper_b = mid; - else - lower_b = mid; + s_begin -= 1; + uint64_t* p = m_psi_buf.data(); + // extract the psi values between two samples + m_psi.get_inter_sampled_values(s_begin, p); + p = m_psi_buf.data(); + uint64_t smpl = m_psi.sample(s_begin); + + size_t abs_decode_begin = s_begin*sd; + size_t skip = 0; + if (abs_decode_begin < cc_begin) { + skip = cc_begin - abs_decode_begin; } - if (lower_b > C[cc]) - return lower_b - C[cc] + 1; - else { // lower_b == m_C[cc] - return m_psi[lower_b] < i;// 1 if m_psi[lower_b]= i) + break; + ++res; } + return res; } private: + template + size_t upper_bound(size_t first, size_t last, V value) const + { + size_t mid; + size_t count, step; + count = last-first; + + while (count > 0) { + mid = first; + step = count / 2; + mid += step; + if (!(value < m_psi.sample(mid))) { + first = ++mid; + count -= step + 1; + } else count = step; + } + return first; + } + // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text. /* * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. From 53452cc8ea46026ee98633c7645baa6410f38246 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Sat, 30 Apr 2016 11:28:38 +1000 Subject: [PATCH 04/21] added double_rank --- include/sdsl/csa_sada.hpp | 66 +++++++++++++++++++++++++ include/sdsl/suffix_array_algorithm.hpp | 7 ++- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index b0649f4dc..f4865de96 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -342,6 +342,72 @@ class csa_sada return res; } + template + std::tuple double_rank_comp_bwt(size_type i, size_type j, const t_char cc)const + { + if (i == 0) + return std::make_tuple(0, rank_comp_bwt(j,cc)); + assert(i <= size()); + const auto cc_begin = C[cc]; // begin of interval of context cc (inclusive) + const auto cc_end = C[cc+1]; // end of interval of context cc (exclusive) + const size_type sd = m_psi.get_sample_dens(); + size_type s_begin = (cc_begin+sd)/sd; // first sample after cc_begin + size_type s_end = (cc_end+sd-1)/sd; // first sample at or after cc_end + bool answer_j = false; + + if (s_begin == s_end) { + // Case (1): No sample inside [cc_begin, cc_end) + // => search in previous block (s_begin-1) + answer_j = true; + } else if (m_psi.sample(s_begin) >= i) { // now s_begin < s_end + // Case (2): Some samples inside [cc_begin, cc_end) + // and first sample already larger or equal to i + // => search in previous block (s_begin-1) + answer_j = (m_psi.sample(s_begin) >= j); + } else { // still s_begin < s_end + // Case (3): Some samples inside [cc_begin, cc_end) + // and first sample smaller than i + // => binary search for first sample >= i + s_begin = upper_bound(s_begin, s_end, i-1); + // => search in previous block (s_begin-1) + answer_j = (s_begin == s_end) or m_psi.sample(s_begin>=j); + } + // TODO: add ALL ONES TRICK inside a block + s_begin -= 1; + uint64_t* p = m_psi_buf.data(); + // extract the psi values between two samples + m_psi.get_inter_sampled_values(s_begin, p); + p = m_psi_buf.data(); + uint64_t smpl = m_psi.sample(s_begin); + + size_t abs_decode_begin = s_begin*sd; + size_t skip = 0; + if (abs_decode_begin < cc_begin) { + skip = cc_begin - abs_decode_begin; + } + size_t res = abs_decode_begin + skip - cc_begin; + + auto it = p + skip; + for (; (res < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { + if ((*it)+smpl >= i) { + break; + } + ++res; + } + + if (answer_j) { + size_t res2 = res; + for (; (res < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { + if ((*it)+smpl >= j) { + break; + } + ++res2; + } + return std::make_tuple(res, res2); + } + return std::make_tuple(res, rank_comp_bwt(j, cc)); + } + private: template diff --git a/include/sdsl/suffix_array_algorithm.hpp b/include/sdsl/suffix_array_algorithm.hpp index b849ef6e5..1a1105eb9 100644 --- a/include/sdsl/suffix_array_algorithm.hpp +++ b/include/sdsl/suffix_array_algorithm.hpp @@ -309,8 +309,11 @@ uint64_t backward_search( r = csa.C[x+1] - 1; // std::cout<<"initial step ["<(x+1)<(lr); + r = c_begin + std::get<1>(lr)-1; +// l = c_begin + csa.rank_comp_bwt(l, x); // count c in bwt[0..l-1] +// r = c_begin + csa.rank_comp_bwt(r+1, x) - 1; // count c in bwt[0..r] } // std::cout<<"bw_search debug ["<> 32)) { // hi >= 32 + if ((t = tt >> 16)) { // hi >= 48 + return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t]; + } else { // hi < 48 + return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt]; + } + } else { // hi < 32 + if ((t = x >> 16)) { // hi >= 16 + return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t]; + } else { // hi < 16 + return (tt = x >> 8) ? 8 + lt_hi[tt] : lt_hi[x]; + } + } +#endif + } //! Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists /*! \param x 64 bit integer. @@ -379,32 +422,6 @@ inline uint32_t bits::_sel(uint64_t x, uint32_t i) return 0; } -// using built-in method or -// 64-bit version of 32-bit proposal of -// http://www-graphics.stanford.edu/~seander/bithacks.html -inline uint32_t bits::hi(uint64_t x) -{ -#ifdef __SSE4_2__ - if (x == 0) - return 0; - return 63 - __builtin_clzll(x); -#else - uint64_t t,tt; // temporaries - if ((tt = x >> 32)) { // hi >= 32 - if ((t = tt >> 16)) { // hi >= 48 - return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t]; - } else { // hi < 48 - return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt]; - } - } else { // hi < 32 - if ((t = x >> 16)) { // hi >= 16 - return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t]; - } else { // hi < 16 - return (tt = x >> 8) ? 8 + lt_hi[tt] : lt_hi[x]; - } - } -#endif -} // details see: http://citeseer.ist.psu.edu/leiserson98using.html // or page 10, Knuth TAOCP Vol 4 F1A diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp new file mode 100644 index 000000000..e1a55ebd3 --- /dev/null +++ b/include/sdsl/hyb_sd_vector.hpp @@ -0,0 +1,917 @@ +/* sdsl - succinct data structures library + Copyright (C) 2012-2014 Simon Gog + Copyright (C) 2015 Genome Research Ltd. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*!\file sd_vector.hpp + \brief sd_vector.hpp contains the sdsl::sd_vector class, and + classes which support rank and select for sd_vector. + \author Simon Gog, Jouni Siren +*/ +#ifndef INCLUDED_SDSL_HYB_SD_VECTOR +#define INCLUDED_SDSL_HYB_SD_VECTOR + +#include "int_vector.hpp" +#include "sd_vector.hpp" +#include "util.hpp" +#include "iterators.hpp" + +//! Namespace for the succinct data structure library +namespace sdsl +{ + +template +std::string print_vec(t_itr beg, t_itr end) +{ + std::string str = "["; + auto itr = beg; + while (itr != (end - 1)) { + str += std::to_string(*itr) + " "; + ++itr; + } + str += std::to_string(*itr) + "]"; + return str; +} + +inline uint64_t next0(const uint64_t* word, uint64_t idx) +{ + word += (idx >> 6); + auto masked_inverse_word = ~(*word | sdsl::bits::lo_set[(idx & 0x3F) + 1]); + if (masked_inverse_word) { + return (idx & ~((size_t)0x3F)) + sdsl::bits::lo(masked_inverse_word); + } + idx = (idx & ~((size_t)0x3F)) + 64; + ++word; + while (*word == 0xFFFFFFFFFFFFFFFFULL) { + idx += 64; + ++word; + } + return idx + sdsl::bits::lo(~(*word)); +} + +/*! + * \param word Beginning of bit_vector (represented as sequence of uint64_t words) + * \param idx Initial scanning position (in bits) + * \param i i + * \return The number of set bits up to position i (exlusive) + */ +template +inline uint64_t cnt(const uint64_t* word, uint64_t idx, uint64_t i) +{ +// std::cout<<"cnt("<> 6); + auto offset = idx & 0x3F; +// std::cout<<"offset="<> offset; + uint64_t pre_considered = 0; + uint64_t considered = 64 - offset; +// std::cout<<"considered="<= t_block_size) { + return t_block_size; + } + pre_considered = considered; + considered += 64; + w = *(++word); + } + +//std::cout<<"considered="<> 6); + auto offset = idx & 0x3F; + uint64_t w = (~(*word)) >> offset; + uint64_t considered = 64 - offset; + uint64_t res = 0; + uint64_t cnt = 0; + uint64_t word_cnt = sdsl::bits::cnt(w); + + while (cnt + word_cnt < i) { + cnt += word_cnt; + if (considered > max_considered) { +// std::cout << "LOOP considered = " << considered << " max_considered = " << max_considered << std::endl; + return std::numeric_limits::max(); + } + res = considered; + considered += 64; + w = (~(*(++word))); + word_cnt = sdsl::bits::cnt(w); + } + // cnt < i and cnt+word_cnt >= i + // add select (i-cnt) to res + res += sdsl::bits::sel(w, i - cnt); + if (res > max_considered) { +// std::cout << "res = " << res << " i = " << i << std::endl; +// std::cout << "IF considered = " << considered << " max_considered = " << max_considered << std::endl; + return std::numeric_limits::max(); + } + return res; +} + +template +class hyb_sd_block_support_bv +{ + public: + typedef bit_vector::size_type size_type; + typedef size_type value_type; + + public: + static size_type estimate_size(size_type u) + { + return u; + } + static size_type + serialize(sdsl::bit_vector& bv, size_type offset, sdsl::int_vector<64>& data, size_type) + { +// std::cout<<"write BV of size "<&, size_type, size_type i, size_type) + { + return i; + } +}; + +template +class hyb_sd_block_support_ef +{ + public: + typedef bit_vector::size_type size_type; + typedef size_type value_type; + + private: + // TODO factor out calculation of logu and logm + + public: + static size_type estimate_size(size_type u) + { + uint8_t logu = sdsl::bits::hi(u) + 1; + uint8_t logm = sdsl::bits::hi(t_block_size) + 1; // TODO constexpr for hi? + if (logm == logu) + logm--; + size_type width_low = logu - logm; + size_type size_in_bits = width_low * t_block_size + (1ULL << logm) + t_block_size + 1; + return size_in_bits; + } + static size_type + serialize(sdsl::bit_vector& bv, size_type offset, sdsl::int_vector<64>& data, size_type u) + { + size_type written_bits = 0; + uint8_t logu = sdsl::bits::hi(u) + 1; + uint8_t logm = sdsl::bits::hi(t_block_size) + 1; + if (logm == logu) + logm--; + size_type width_low = logu - logm; +//std::cout<<"offst="< +class select_support_hyb_sd; + +template +class rank_support_hyb_sd; + +template +class hyb_sd_vector +{ + public: + typedef bit_vector::size_type size_type; + typedef bool value_type; + typedef bit_vector::difference_type difference_type; + typedef random_access_const_iterator iterator; + typedef iterator const_iterator; + typedef bv_tag index_category; + + // typedef rank_support_hyb_sd<0, hyb_sd_vector> rank_0_type; + typedef rank_support_hyb_sd<1, hyb_sd_vector> rank_1_type; + // typedef select_support_hyb_sd<0, hyb_sd_vector> select_0_type; + typedef select_support_hyb_sd<1, hyb_sd_vector> select_1_type; + + private: + sd_vector<> m_top; + sd_vector<>::select_1_type m_top_sel; + sd_vector<>::rank_1_type m_top_rank; + sdsl::bit_vector m_bottom; + sdsl::int_vector<> m_block_start; + size_type m_size = 0; + size_type m_num_ones = 0; + + public: + static constexpr uint16_t block_size = t_block_size; + + private: + enum class hyb_sd_blocktype + { + EF, + BV, + FULL + }; + + std::pair + determine_block_type(size_t u) const + { + if (u == t_block_size) + return { hyb_sd_blocktype::FULL, 0 }; + size_type ef_bits = hyb_sd_block_support_ef::estimate_size(u); + size_type bv_bits = hyb_sd_block_support_bv::estimate_size(u); + if (bv_bits < ef_bits) + return { hyb_sd_blocktype::BV, bv_bits }; + return { hyb_sd_blocktype::EF, ef_bits }; + } + + size_type compress_block(size_type offset, sdsl::int_vector<64>& data, size_t u) + { + size_type written_bits = 0; + auto bt = determine_block_type(u); + auto type = bt.first; + auto size_in_bits = bt.second; + if (m_bottom.size() < offset + size_in_bits) { + m_bottom.resize(m_bottom.size() * 2 + size_in_bits); + } + + for (size_t i=1; i + hyb_sd_vector(const t_itr begin, const t_itr end, size_type bv_size = 0) + { + if (begin == end and bv_size==0) { + return; + } + if (!is_sorted(begin, end)) { + throw std::runtime_error("hyb_sd_vector: source list is not sorted."); + } + m_size = bv_size; + if (bv_size == 0) + m_size = *(end - 1) + 1; + m_num_ones = std::distance(begin, end); + if (m_num_ones==0) { + return; + } + size_type num_full_blocks = m_num_ones / t_block_size; + size_type num_blocks = num_full_blocks; + size_type num_leftover = m_num_ones % t_block_size; + size_type num_dummy = (t_block_size - num_leftover) % t_block_size; + bool has_leftover_block = num_leftover != 0; + if (has_leftover_block) { + num_blocks++; + } + + // (1) fill the top level + std::vector top_lvl; + auto itr = begin; + while (itr < end) { + top_lvl.push_back(*itr); + itr += t_block_size; + } + // terminate the top level so top[i+1] - top[i] always works + top_lvl.push_back(std::max(bv_size, *(end - 1)+1) + num_dummy); + + // (2) bottom level + m_block_start.resize(num_blocks + 1); + itr = begin; + size_type value_offset = 0; + size_type written_bits = 0; + sdsl::int_vector<64> tmp_data(t_block_size); + for (size_type i = 0; i < num_blocks; i++) { + m_block_start[i] = written_bits; + // (2a) compute block data + value_offset = top_lvl[i]; + for (size_type j = 0; j < t_block_size; j++) { + if (itr == end) { + tmp_data[j] = std::max(m_size-value_offset, tmp_data[j - 1] + 1); + } else { + tmp_data[j] = *itr - value_offset; + ++itr; + } + } + + // (2b) compress block + size_type block_universe = top_lvl[i + 1] - top_lvl[i]; + // std::cout << "compress block " << i << std::endl; + auto wb = compress_block(m_block_start[i], tmp_data, block_universe); + written_bits += wb; + } + m_block_start[num_blocks] = written_bits; + m_bottom.resize(written_bits); + + // (3) encode the top level + m_top = decltype(m_top)(top_lvl.begin(), top_lvl.end()); + m_top_sel = decltype(m_top_sel)(&m_top); + m_top_rank = decltype(m_top_rank)(&m_top); + + // (4) bit compress pointers + sdsl::util::bit_compress(m_block_start); + } + + + value_type operator[](size_type i)const + { + return rank_1(i+1) - rank_1(i); + } + + //! Accessing the i-th element of the original bit_vector + size_type select_1(size_type i) const + { + i = i - 1; + auto block_id = i / t_block_size; + auto in_block_offset = i % t_block_size; + auto top_value = m_top_sel(block_id + 1); + size_type res = top_value; + + if (in_block_offset == 0) + return res; + + auto u = m_top_sel(block_id + 2) - top_value; + auto bt = determine_block_type(u); + auto block_type = bt.first; + size_type block_offset = m_block_start[block_id]; + switch (block_type) { + case hyb_sd_blocktype::BV: + // std::cout << "BV" << std::endl; + res += hyb_sd_block_support_bv::select_1(m_bottom, block_offset, in_block_offset, u); + break; + case hyb_sd_blocktype::EF: + // std::cout << "EF" << std::endl; + res += hyb_sd_block_support_ef::select_1(m_bottom, block_offset, in_block_offset, u); + break; + case hyb_sd_blocktype::FULL: + // std::cout << "FULL" << std::endl; + res += hyb_sd_block_support_full::select_1(m_bottom, block_offset, in_block_offset, u); + break; + } + return res; + } + + size_type rank_1(size_type i) const + { + if (i > m_size or m_num_ones == 0) { + return m_num_ones; + } + auto block_id = m_top_rank(i); + if (block_id == 0) + return 0; + block_id -= 1; + size_type res = block_id * t_block_size; + auto top_value = m_top_sel(block_id + 1); + size_type in_block_i = i; + in_block_i -= top_value; + if (in_block_i == 0) + return res; + + auto u = m_top_sel(block_id + 2) - top_value; + auto bt = determine_block_type(u); + auto block_type = bt.first; + size_type block_offset = m_block_start[block_id]; + switch (block_type) { + case hyb_sd_blocktype::BV: +// std::cout << "BV" << std::endl; + res += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + break; + case hyb_sd_blocktype::EF: +// std::cout << "EF" << std::endl; + res += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + break; + case hyb_sd_blocktype::FULL: +// std::cout << "FULL" << std::endl; + res += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + break; + } + + return res; + } + + //! Get the integer value of the binary string of length len starting at position idx. + uint64_t get_int(size_type idx, const uint8_t len = 64) const + { + uint64_t x = 0ULL; + for (size_t i=0; i((*this)[idx+i])) << i; + } + return x; + } + + //! Returns the size of the original bit vector. + size_type size() const + { + return m_size; + } + + //! Serializes the data structure into the given ostream + size_type serialize(std::ostream& out, structure_tree_node* v = nullptr, std::string name = "") const + { + structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); + size_type written_bytes = 0; + written_bytes += write_member(m_size, out, child, "size"); + written_bytes += write_member(m_num_ones, out, child, "num_ones"); + written_bytes += m_top.serialize(out, child, "top"); + written_bytes += m_top_sel.serialize(out, child, "top_sel"); + written_bytes += m_top_rank.serialize(out, child, "top_rank"); + written_bytes += m_bottom.serialize(out, child, "bottom"); + written_bytes += m_block_start.serialize(out, child, "block_start"); + structure_tree::add_size(child, written_bytes); + return written_bytes; + } + + //! Loads the data structure from the given istream. + void load(std::istream& in) + { + read_member(m_size, in); + read_member(m_num_ones, in); + m_top.load(in); + m_top_sel.load(in); + m_top_sel.set_vector(&m_top); + m_top_rank.load(in); + m_top_rank.set_vector(&m_top); + m_bottom.load(in); + m_block_start.load(in); + } + + void swap(hyb_sd_vector& v) + { + std::swap(m_size, v.m_size); + std::swap(m_num_ones, v.m_num_ones); + m_top.swap(v.m_top); + util::swap_support(m_top_sel, v.m_top_sel, &m_top, &(v.m_top)); + util::swap_support(m_top_rank, v.m_top_rank, &m_top, &(v.m_top)); + m_bottom.swap(v.m_bottom); + m_block_start.swap(v.m_block_start); + } + + iterator begin() const + { + return iterator(this, 0); + } + + iterator end() const + { + return iterator(this, size()); + } +}; + +//! Select data structure for hyb_sd_vector +template > +class select_support_hyb_sd +{ + public: + typedef typename hyb_bv_type::size_type size_type; + typedef hyb_bv_type bit_vector_type; + enum { bit_pat = t_b }; + enum { bit_pat_len = (uint8_t)1 }; + static constexpr uint16_t block_size = hyb_bv_type::block_size; + + private: + const hyb_bv_type* m_v; + + public: + explicit select_support_hyb_sd(const hyb_bv_type* v = nullptr) + { + set_vector(v); + } + + size_type select(size_type i) const + { + return m_v->select_1(i); + } + + size_type operator()(size_type i) const + { + return select(i); + } + + size_type size() const + { + return m_v->size(); + } + + void set_vector(const hyb_bv_type* v = nullptr) + { + m_v = v; + } + + select_support_hyb_sd& operator=(const select_support_hyb_sd& ss) + { + if (this != &ss) { + set_vector(ss.m_v); + } + return *this; + } + + void swap(select_support_hyb_sd&) {} + + void load(std::istream&, const hyb_bv_type* v = nullptr) + { + set_vector(v); + } + + size_type serialize(std::ostream& out, structure_tree_node* v = nullptr, std::string name = "") const + { + return serialize_empty_object(out, v, name, this); + } +}; + +//! Rank data structure for hyb_sd_vector +template > +class rank_support_hyb_sd +{ + public: + typedef typename hyb_bv_type::size_type size_type; + typedef hyb_bv_type bit_vector_type; + enum { bit_pat = t_b }; + enum { bit_pat_len = (uint8_t)1 }; + static constexpr uint16_t block_size = hyb_bv_type::block_size; + + private: + const hyb_bv_type* m_v; + + public: + explicit rank_support_hyb_sd(const hyb_bv_type* v = nullptr) + { + set_vector(v); + } + + size_type rank(size_type i) const + { + return m_v->rank_1(i); + } + + size_type operator()(size_type i) const + { + return rank(i); + } + + size_type size() const + { + return m_v->size(); + } + + void set_vector(const hyb_bv_type* v = nullptr) + { + m_v = v; + } + + rank_support_hyb_sd& operator=(const rank_support_hyb_sd& ss) + { + if (this != &ss) { + set_vector(ss.m_v); + } + return *this; + } + + void swap(rank_support_hyb_sd&) {} + + void load(std::istream&, const hyb_bv_type* v = nullptr) + { + set_vector(v); + } + + size_type serialize(std::ostream& out, structure_tree_node* v = nullptr, std::string name = "") const + { + return serialize_empty_object(out, v, name, this); + } +}; + +} // end namespace +#endif diff --git a/lib/bits.cpp b/lib/bits.cpp index 19bdfdfc2..e2f03056c 100644 --- a/lib/bits.cpp +++ b/lib/bits.cpp @@ -54,7 +54,7 @@ const uint8_t bits::lt_cnt[] = { 5, 6, 6, 7, 6, 7, 7, 8 }; - +constexpr uint32_t bits::lt_hi[]; const uint32_t bits::lt_deBruijn_to_idx[] = { 0, 1, 2, 7, 3,13, 8,19, @@ -67,25 +67,6 @@ const uint32_t bits::lt_deBruijn_to_idx[] = { 61,22,43,51,60,42,59,58 }; -const uint32_t bits::lt_hi[] = { - 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -}; - const uint64_t bits::lo_set[] = { 0x0000000000000000ULL, 0x0000000000000001ULL, From 4ebfdb54f5375b619d60376fecaa08dbcaad57d4 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Wed, 4 May 2016 12:57:45 +1000 Subject: [PATCH 06/21] csa_sada2 realizes the CSA concept now --- include/sdsl/bits.hpp | 1 + include/sdsl/construct_sa.hpp | 1 + include/sdsl/csa_sada.hpp | 69 +++- include/sdsl/csa_sada2.hpp | 595 +++++++++++++++++++++++++++++++++ include/sdsl/enc_vector.hpp | 26 ++ include/sdsl/hyb_sd_vector.hpp | 58 +++- include/sdsl/suffix_arrays.hpp | 1 + 7 files changed, 730 insertions(+), 21 deletions(-) create mode 100644 include/sdsl/csa_sada2.hpp diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 1bfb65b72..132661c16 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -154,6 +154,7 @@ struct bits { return (tt = x >> 8) ? 8 + lt_hi[tt] : lt_hi[x]; } } + return 0; #endif } diff --git a/include/sdsl/construct_sa.hpp b/include/sdsl/construct_sa.hpp index cacb8626d..22eaaf7ab 100644 --- a/include/sdsl/construct_sa.hpp +++ b/include/sdsl/construct_sa.hpp @@ -24,6 +24,7 @@ #include "config.hpp" #include "int_vector.hpp" +#include "bits.hpp" #include "divsufsort.h" #include "divsufsort64.h" diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index f4865de96..c242f0683 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -22,6 +22,7 @@ #define INCLUDED_SDSL_CSA_SADA #include "enc_vector.hpp" +#include "enc_vector2.hpp" #include "int_vector.hpp" #include "iterators.hpp" #include "suffix_array_helper.hpp" @@ -321,10 +322,6 @@ class csa_sada // => search in previous block (s_begin-1) } s_begin -= 1; - uint64_t* p = m_psi_buf.data(); - // extract the psi values between two samples - m_psi.get_inter_sampled_values(s_begin, p); - p = m_psi_buf.data(); uint64_t smpl = m_psi.sample(s_begin); size_t abs_decode_begin = s_begin*sd; @@ -334,6 +331,15 @@ class csa_sada } size_t res = abs_decode_begin + skip - cc_begin; + if ((s_begin+1)*sd < m_psi.size() and skip == 0 and smpl+sd == m_psi.sample(s_begin+1)) { + return res + (i - smpl); + } + + uint64_t* p = m_psi_buf.data(); + // extract the psi values between two samples + m_psi.get_inter_sampled_values(s_begin, p); + p = m_psi_buf.data(); + for (auto it = p + skip; (res < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { if ((*it)+smpl >= i) break; @@ -370,14 +376,9 @@ class csa_sada // => binary search for first sample >= i s_begin = upper_bound(s_begin, s_end, i-1); // => search in previous block (s_begin-1) - answer_j = (s_begin == s_end) or m_psi.sample(s_begin>=j); + answer_j = (s_begin == s_end) or (m_psi.sample(s_begin) >=j); } - // TODO: add ALL ONES TRICK inside a block s_begin -= 1; - uint64_t* p = m_psi_buf.data(); - // extract the psi values between two samples - m_psi.get_inter_sampled_values(s_begin, p); - p = m_psi_buf.data(); uint64_t smpl = m_psi.sample(s_begin); size_t abs_decode_begin = s_begin*sd; @@ -387,6 +388,20 @@ class csa_sada } size_t res = abs_decode_begin + skip - cc_begin; + bool uniform_block = (s_begin+1)*sd < m_psi.size() and skip == 0 and smpl+sd == m_psi.sample(s_begin+1); + if (uniform_block) { + if (answer_j) { + return std::make_tuple(res + (i - smpl), res + (j - smpl)); + } else { + return std::make_tuple(res + (i - smpl), rank_comp_bwt(j, cc)); + } + } + + uint64_t* p = m_psi_buf.data(); + // extract the psi values between two samples + m_psi.get_inter_sampled_values(s_begin, p); + p = m_psi_buf.data(); + auto it = p + skip; for (; (res < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { if ((*it)+smpl >= i) { @@ -394,10 +409,9 @@ class csa_sada } ++res; } - if (answer_j) { size_t res2 = res; - for (; (res < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { + for (; (res2 < cc_end - cc_begin) and it < m_psi_buf.data()+sd; ++it) { if ((*it)+smpl >= j) { break; } @@ -514,6 +528,37 @@ csa_sada psi_buf(cache_file_name(conf::KEY_PSI, config)); t_enc_vec tmp_psi(psi_buf); m_psi.swap(tmp_psi); + /* + enc_vector m_psi_check(psi_buf); + if ( m_psi_check.size() != m_psi.size() ){ + std::cout<<"m_psi.size()="< buf1 = std::vector(enc_vector_type::sample_dens+1); + std::vector buf2 = std::vector(enc_vector_type::sample_dens+1); + + std::cout<<"m_psi.size()="< +#include +#include +#include // for strlen +#include +#include + + + +namespace sdsl +{ + +template +class uef_psi_support +{ + public: + typedef typename bit_vector::size_type size_type; + typedef size_type value_type; + typedef typename t_csa::alphabet_type alphabet_type; + typedef typename alphabet_type::comp_char_type comp_char_type; + typedef typename alphabet_type::C_type C_type; + typedef random_access_const_iterator iterator; + typedef iterator const_iterator; + typedef const value_type reference; + typedef const value_type const_reference; + typedef const value_type* const_pointer; + typedef ptrdiff_t difference_type; + typedef csa_member_tag category; + typedef int_alphabet_tag alphabet_category; + + private: + std::vector m_inc_seq; + std::vector m_inc_seq_rank; + std::vector m_inc_seq_sel; + const t_csa* m_csa; + + void set_vector() + { + for (size_t i=0; i& psi_buf, const t_csa* csa) + { +// std::cout<<"Hello!!!!"<C; +// std::cout<<"C.size()="< v(C[i+1]-C[i]); + //bit_vector bv(size(), 0); + for (size_t j=C[i]; jC.begin(), m_csa->C.end(),i) - m_csa->C.begin() - 1; + size_t cum_sum = m_csa->C[cc]; + /* if (50929==i){ + std::cout<sigma="<<(size_t)m_csa->sigma<C[cc+1]-1<<"] of size "<C[cc+1]-cum_sum<, // Vector type used to store the Psi-function + uint32_t t_dens = 32, // Sample density for suffix array (SA) values + uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values + class t_sa_sample_strat = sa_order_sa_sampling<>,// Policy class for the SA sampling. + class t_isa_sample_strat= isa_sampling<>, // Policy class for ISA sampling. + class t_alphabet_strat = byte_alphabet // Policy class for the representation of the alphabet. + > +class csa_sada2 +{ + static_assert(t_dens > 0, + "Second template argument has to be greater then 0."); + static_assert(t_inv_dens > 0, + "Third template argument has to be greater then 0."); + static_assert(std::is_same::type, sa_sampling_tag>::value, + "Forth template argument has to be a suffix array sampling strategy."); + static_assert(std::is_same::type, isa_sampling_tag>::value, + "Fifth template argument has to be a inverse suffix array sampling strategy."); + static_assert(is_alphabet::value, + "Sixth template argument has to be a alphabet strategy."); + + friend class bwt_of_csa_psi; + public: + enum { sa_sample_dens = t_dens, + isa_sample_dens = t_inv_dens + }; + + typedef uint64_t value_type; + typedef random_access_const_iterator const_iterator; + typedef const_iterator iterator; + typedef const value_type const_reference; + typedef const_reference reference; + typedef const_reference* pointer; + typedef const pointer const_pointer; + typedef int_vector<>::size_type size_type; + typedef size_type csa_size_type; + typedef ptrdiff_t difference_type; + typedef traverse_csa_psi lf_type; + typedef bwt_of_csa_psi bwt_type; + typedef isa_of_csa_psi isa_type; + typedef text_of_csa text_type; + typedef first_row_of_csa first_row_type; + typedef typename t_sa_sample_strat::template type sa_sample_type; + typedef typename t_isa_sample_strat::template type isa_sample_type; + typedef t_alphabet_strat alphabet_type; + typedef typename alphabet_type::alphabet_category alphabet_category; + typedef typename alphabet_type::comp_char_type comp_char_type; + typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT! + typedef typename alphabet_type::string_type string_type; + typedef csa_sada2 csa_type; + + typedef csa_tag index_category; + typedef psi_tag extract_category; + typedef uef_psi_support psi_type; + + friend class traverse_csa_psi; + friend class traverse_csa_psi; + + private: + psi_type m_psi_support; // psi function + sa_sample_type m_sa_sample; // suffix array samples + isa_sample_type m_isa_sample; // inverse suffix array samples + alphabet_type m_alphabet; // alphabet component + + void copy(const csa_sada2& csa) + { + m_psi_support = csa.m_psi_support; + m_psi_support.set_vector(this); + m_sa_sample = csa.m_sa_sample; + m_isa_sample = csa.m_isa_sample; + m_isa_sample.set_vector(&m_sa_sample); + m_alphabet = csa.m_alphabet; + }; + + public: + const typename alphabet_type::char2comp_type& char2comp = m_alphabet.char2comp; + const typename alphabet_type::comp2char_type& comp2char = m_alphabet.comp2char; + const typename alphabet_type::C_type& C = m_alphabet.C; + const typename alphabet_type::sigma_type& sigma = m_alphabet.sigma; + const alphabet_type& alphabet = m_alphabet; + const psi_type& psi = m_psi_support; + const lf_type lf = lf_type(*this); + const bwt_type bwt = bwt_type(*this); + const isa_type isa = isa_type(*this); + const bwt_type L = bwt_type(*this); + const first_row_type F = first_row_type(*this); + const text_type text = text_type(*this); + const sa_sample_type& sa_sample = m_sa_sample; + const isa_sample_type& isa_sample = m_isa_sample; + + + //! Default Constructor + csa_sada2() { } + //! Default Destructor + ~csa_sada2() { } + + //! Copy constructor + csa_sada2(const csa_sada2& csa) + { + copy(csa); + } + + //! Move constructor + csa_sada2(csa_sada2&& csa) + { + *this = std::move(csa); + } + + csa_sada2(cache_config& config); + + //! Number of elements in the \f$\CSA\f$. + /*! Required for the Container Concept of the STL. + * \sa max_size, empty + * \par Time complexity + * \f$ \Order{1} \f$ + */ + size_type size()const + { + return C[C.size()-1]; + } + + //! Returns the largest size that csa_sada2 can ever have. + /*! Required for the Container Concept of the STL. + * \sa size + */ + static size_type max_size() + { + return int_vector<>::max_size(); + } + + //! Returns if the data strucutre is empty. + /*! Required for the Container Concept of the STL.A + * \sa size + */ + bool empty()const + { + return 0==size(); + } + + //! Swap method for csa_sada2 + /*! The swap method can be defined in terms of assignment. + This requires three assignments, each of which, for a container type, is linear + in the container's size. In a sense, then, a.swap(b) is redundant. + This implementation guaranties a run-time complexity that is constant rather than linear. + \param csa csa_sada2 to swap. + + Required for the Assignable Conecpt of the STL. + */ + void swap(csa_sada2& csa) + { + if (this != &csa) { + util::swap_support(m_psi_support, csa.m_psi_support, this, &csa); + m_sa_sample.swap(csa.m_sa_sample); + util::swap_support(m_isa_sample, csa.m_isa_sample, &m_sa_sample, &(csa.m_sa_sample)); + m_alphabet.swap(csa.m_alphabet); + } + } + + + //! Returns a const_iterator to the first element. + /*! Required for the STL Container Concept. + * \sa end + */ + const_iterator begin()const + { + return const_iterator(this, 0); + } + + //! Returns a const_iterator to the element after the last element. + /*! Required for the STL Container Concept. + * \sa begin. + */ + const_iterator end()const + { + return const_iterator(this, size()); + } + + //! []-operator + /*! \param i Index of the value. \f$ i \in [0..size()-1]\f$. + * Required for the STL Random Access Container Concept. + * \par Time complexity + * \f$ \Order{s_{SA}\cdot t_{\Psi}} \f$, where every \f$s_{SA}\f$th suffix array entry is sampled and \f$t_{\Psi}\f$ + * is the access time for an element in the \f$\Psi\f$-function. + */ + value_type operator[](size_type i)const + { + size_type off = 0; + while (!m_sa_sample.is_sampled(i)) { // while i mod t_dens != 0 (SA[i] is not sampled) + i = psi[i]; // go to the position where SA[i]+1 is located + ++off; // add 1 to the offset + } + value_type result = m_sa_sample[i]; + if (result < off) { + return psi.size()-(off-result); + } else + return result-off; + } + + + //! Assignment Copy Operator. + /*! + * Required for the Assignable Concept of the STL. + */ + csa_sada2& operator=(const csa_sada2& csa) + { + if (this != &csa) { + copy(csa); + } + return *this; + } + + //! Assignment Move Operator. + /*! + * Required for the Assignable Concept of the STL. + */ + csa_sada2& operator=(csa_sada2&& csa) + { + if (this != &csa) { + m_psi_support = std::move(csa.m_psi_support); + m_psi_support.set_vector(this); + m_sa_sample = std::move(csa.m_sa_sample); + m_isa_sample = std::move(csa.m_isa_sample); + m_alphabet = std::move(csa.m_alphabet); + } + return *this; + } + + //! Serialize to a stream. + /*! \param out Outstream to write the data structure. + * \return The number of written bytes. + */ + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const + { + structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); + size_type written_bytes = 0; + written_bytes += m_psi_support.serialize(out, child, "psi"); + written_bytes += m_sa_sample.serialize(out, child, "sa_samples"); + written_bytes += m_isa_sample.serialize(out, child, "isa_samples"); + written_bytes += m_alphabet.serialize(out, child, "alphabet"); + structure_tree::add_size(child, written_bytes); + return written_bytes; + } + + //! Load from a stream. + /*! \param in Input stream to load the data structure from. + */ + void load(std::istream& in) + { + m_psi_support.load(in); + m_psi_support.set_vector(this); + m_sa_sample.load(in); + m_isa_sample.load(in, &m_sa_sample); + m_alphabet.load(in); + } + + // Calculates how many symbols cc are in the prefix [0..i-1] of the BWT of the original text. + /* + * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. + * \param cc The compactified symbol to count in the prefix. + * \returns The number of occurrences of the compactified symbol cc in the prefix [0..i-1]. + * \par Time complexity + * \f$ \Order{\log n t_{\Psi}} \f$ + */ + template + size_type rank_comp_bwt(size_type i, const t_char cc)const + { + return m_psi_support.rank(i, cc); + } + + template + std::tuple double_rank_comp_bwt(size_type i, size_type j, const t_char cc)const + { +// return m_psi_support.rank({i, j}, cc); + return {rank_comp_bwt(i, cc), rank_comp_bwt(j, cc)}; + } + + private: + + // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text. + /* + * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. + * \param c The symbol to count in the prefix. + * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT. + * \par Time complexity + * \f$ \Order{\log n t_{\Psi}} \f$ + */ + // replace const char_type c by const std::array& c + template + size_type rank_bwt(size_type i, const t_char c)const + { + auto cc = char2comp[c]; + if (cc==0 and c!=0) // character is not in the text => return 0 + return 0; + if (i == 0) + return 0; + return rank_comp_bwt(i, cc); + } + + + + // Calculates the position of the i-th c in the BWT of the original text. + /* + * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$. + * \param c Symbol c. + * \returns The position of the i-th c in the BWT or size() if c does occur less then i times. + * \par Time complexity + * \f$ \Order{t_{\Psi}} \f$ + */ + size_type select_bwt(size_type i, const char_type c)const + { + assert(i > 0); + comp_char_type cc = char2comp[c]; + if (cc==0 and c!=0) // character is not in the text => return 0 + return size(); + return m_psi_support.select(i, cc); + } +}; + +// == template functions == + +template +csa_sada2::csa_sada2(cache_config& config) +{ + if (!cache_file_exists(key_trait::KEY_BWT, config)) { + return; + } + int_vector_buffer bwt_buf(cache_file_name(key_trait::KEY_BWT,config)); + size_type n = bwt_buf.size(); + { + auto event = memory_monitor::event("construct csa-alpbabet"); +// alphabet_type tmp_alphabet(bwt_buf, n); // TODO: maybe it is possible to use _buf_buf again for multibyte!! + int_vector_buffer text_buf(cache_file_name(key_trait::KEY_TEXT,config)); + alphabet_type tmp_alphabet(text_buf, n); + m_alphabet.swap(tmp_alphabet); + } + + int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1); + for (typename alphabet_type::sigma_type i=0; i < sigma; ++i) { + cnt_chr[i] = C[i]; + } + // calculate psi + { + auto event = memory_monitor::event("construct PSI"); + int_vector<> psi(n, 0, bits::hi(n)+1); + for (size_type i=0; i < n; ++i) { + psi[ cnt_chr[ char2comp[bwt_buf[i]] ]++ ] = i; + } + std::string psi_file = cache_file_name(conf::KEY_PSI, config); + if (!store_to_cache(psi, conf::KEY_PSI, config)) { + return; + } + } + { + auto event = memory_monitor::event("encode PSI"); + int_vector_buffer<> psi_buf(cache_file_name(conf::KEY_PSI, config)); + m_psi_support = psi_type(psi_buf, this); + } + { + auto event = memory_monitor::event("sample SA"); + sa_sample_type tmp_sa_sample(config); + m_sa_sample.swap(tmp_sa_sample); + } + { + auto event = memory_monitor::event("sample ISA"); + isa_sample_type isa_s(config, &m_sa_sample); + util::swap_support(m_isa_sample, isa_s, &m_sa_sample, (const sa_sample_type*)nullptr); + } +} + +} // end namespace sdsl +#endif diff --git a/include/sdsl/enc_vector.hpp b/include/sdsl/enc_vector.hpp index 45a60749b..dafc6002d 100644 --- a/include/sdsl/enc_vector.hpp +++ b/include/sdsl/enc_vector.hpp @@ -306,6 +306,25 @@ enc_vector::enc_vector(int_vector_buffer& v_ v1 = v2; } + /* { + double sd_size_in_megabytes = 0; + size_t begin = 0, end=0; + while( begin != n) { + begin = end; + while ( end+1 < n and v_buf[end]::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::EF: - // std::cout << "EF" << std::endl; + if (debug) { + std::cout << "EF" << std::endl; + g_hi_size = m_block_start[block_id+1]-m_block_start[block_id]; + } res += hyb_sd_block_support_ef::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::FULL: - // std::cout << "FULL" << std::endl; + if (debug) std::cout << "FULL" << std::endl; res += hyb_sd_block_support_full::select_1(m_bottom, block_offset, in_block_offset, u); break; } diff --git a/include/sdsl/suffix_arrays.hpp b/include/sdsl/suffix_arrays.hpp index 834de2dbf..197fd69d2 100644 --- a/include/sdsl/suffix_arrays.hpp +++ b/include/sdsl/suffix_arrays.hpp @@ -28,6 +28,7 @@ #include "csa_bitcompressed.hpp" #include "csa_wt.hpp" #include "csa_sada.hpp" +#include "csa_sada2.hpp" #include "wavelet_trees.hpp" #include "construct.hpp" #include "suffix_array_algorithm.hpp" From 91d332d07307d87047eb89ebaf0775c3b19a3807 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Wed, 4 May 2016 16:32:29 +1000 Subject: [PATCH 07/21] With combined rank --- include/sdsl/csa_bitcompressed.hpp | 7 + include/sdsl/csa_sada.hpp | 7 +- include/sdsl/csa_sada2.hpp | 40 +-- include/sdsl/csa_wt.hpp | 9 +- include/sdsl/enc_vector2.hpp | 344 ++++++++++++++++++++++++ include/sdsl/hyb_sd_vector.hpp | 151 +++++------ include/sdsl/suffix_array_algorithm.hpp | 8 +- include/sdsl/suffix_array_helper.hpp | 15 +- 8 files changed, 466 insertions(+), 115 deletions(-) create mode 100644 include/sdsl/enc_vector2.hpp diff --git a/include/sdsl/csa_bitcompressed.hpp b/include/sdsl/csa_bitcompressed.hpp index ebcbc01c4..c4d5b09e3 100644 --- a/include/sdsl/csa_bitcompressed.hpp +++ b/include/sdsl/csa_bitcompressed.hpp @@ -302,6 +302,13 @@ class csa_bitcompressed } } + std::array + rank_bwt(std::array ij, const char_type c)const + { + return {rank_bwt(ij[0], c), rank_bwt(ij[1],c)}; + } + + // Calculates the i-th occurrence of symbol c in the BWT of the original text. /* * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$. diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index c242f0683..cec3d5db9 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -463,7 +463,12 @@ class csa_sada return rank_comp_bwt(i, cc); } - + template + std::array + rank_bwt(std::array ij, const t_char c)const + { + return {rank_bwt(ij[0], c), rank_bwt(ij[1],c)}; + } // Calculates the position of the i-th c in the BWT of the original text. /* diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index b0c88677a..aca901697 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -140,6 +140,11 @@ class uef_psi_support return m_inc_seq_rank[cc](i); } + std::array rank(std::array ij, comp_char_type cc) const + { + return m_inc_seq_rank[cc](ij); + } + uint64_t select(uint64_t i, comp_char_type cc) const { return m_inc_seq_sel[cc](i); @@ -477,31 +482,11 @@ class csa_sada2 m_alphabet.load(in); } - // Calculates how many symbols cc are in the prefix [0..i-1] of the BWT of the original text. - /* - * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. - * \param cc The compactified symbol to count in the prefix. - * \returns The number of occurrences of the compactified symbol cc in the prefix [0..i-1]. - * \par Time complexity - * \f$ \Order{\log n t_{\Psi}} \f$ - */ - template - size_type rank_comp_bwt(size_type i, const t_char cc)const - { - return m_psi_support.rank(i, cc); - } - - template - std::tuple double_rank_comp_bwt(size_type i, size_type j, const t_char cc)const - { -// return m_psi_support.rank({i, j}, cc); - return {rank_comp_bwt(i, cc), rank_comp_bwt(j, cc)}; - } - private: // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text. /* + * \tpara Type of index. Should either be an unsigned integer or and std::array<,2> of unsigned integers * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$. * \param c The symbol to count in the prefix. * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT. @@ -509,19 +494,18 @@ class csa_sada2 * \f$ \Order{\log n t_{\Psi}} \f$ */ // replace const char_type c by const std::array& c - template - size_type rank_bwt(size_type i, const t_char c)const + template + t_pos rank_bwt(t_pos i, const t_char c)const { auto cc = char2comp[c]; if (cc==0 and c!=0) // character is not in the text => return 0 - return 0; - if (i == 0) - return 0; - return rank_comp_bwt(i, cc); + return t_pos {0}; + if (i == t_pos {0}) + return t_pos {0}; + return m_psi_support.rank(i, cc); } - // Calculates the position of the i-th c in the BWT of the original text. /* * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$. diff --git a/include/sdsl/csa_wt.hpp b/include/sdsl/csa_wt.hpp index e311cebe1..ce0508c55 100644 --- a/include/sdsl/csa_wt.hpp +++ b/include/sdsl/csa_wt.hpp @@ -269,11 +269,18 @@ class csa_wt * \par Time complexity * \f$ \Order{\log |\Sigma|} \f$ */ - size_type rank_bwt(size_type i, const char_type c)const + size_type + rank_bwt(size_type i, const char_type c)const { return m_wavelet_tree.rank(i, c); } + std::array + rank_bwt(std::array ij, const char_type c)const + { + return {rank_bwt(ij[0], c), rank_bwt(ij[1],c)}; + } + // Calculates the position of the i-th c in the BWT of the original text. /* * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$. diff --git a/include/sdsl/enc_vector2.hpp b/include/sdsl/enc_vector2.hpp new file mode 100644 index 000000000..2f9516dc2 --- /dev/null +++ b/include/sdsl/enc_vector2.hpp @@ -0,0 +1,344 @@ +/* sdsl - succinct data structures library + Copyright (C) 2008 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file enc_vector2.hpp + \brief enc_vector2.hpp contains the sdsl::enc_vector2 class. + \author Simon Gog +*/ +#ifndef SDSL_ENC_VECTORII +#define SDSL_ENC_VECTORII + +#include "int_vector.hpp" +#include "coder.hpp" +#include "iterators.hpp" + + +//! Namespace for the succinct data structure library. +namespace sdsl +{ + +template +struct enc_vector2_trait { + typedef int_vector<0> int_vector_type; +}; + +template<> +struct enc_vector2_trait<32> { + typedef int_vector<32> int_vector_type; +}; + +template<> +struct enc_vector2_trait<64> { + typedef int_vector<64> int_vector_type; +}; + +//! A generic immutable space-saving vector class for unsigned integers. +/*! A vector v is stored more space-efficiently by self-delimiting coding + * the deltas v[i+1]-v[i] (v[-1]:=0). Space of the structure and random + * access time to it can be controlled by a sampling parameter t_dens. + * + * \tparam t_coder Self-delimiting coder. + * \tparam t_dens Every t_dens-th element of v is sampled. + * \tparam t_width Width of the int_vector used to store the samples and pointers. + * This class is a parameter of csa_sada. + * @ingroup int_vector + */ +template +class enc_vector2 +{ + private: + static_assert(t_dens > 1 , "enc_vector2: sample density must be larger than `1`"); + public: + typedef uint64_t value_type; + typedef random_access_const_iterator iterator; + typedef iterator const_iterator; + typedef const value_type reference; + typedef const value_type const_reference; + typedef const value_type* const_pointer; + typedef ptrdiff_t difference_type; + typedef int_vector<>::size_type size_type; + typedef t_coder coder; + typedef typename enc_vector2_trait::int_vector_type int_vector_type; + typedef iv_tag index_category; + static constexpr uint32_t sample_dens = t_dens; + typedef enc_vector2 enc_vec_type; + + int_vector<0> m_z; // storage for encoded deltas + private: + int_vector_type m_samples; // samples + sd_vector<> m_pointers; + sd_vector<>::select_1_type m_pointers_sel; + size_type m_size = 0; // number of vector elements + + void clear() + { + m_z.resize(0); + m_size = 0; + m_samples.resize(0); + m_pointers = sd_vector<>(); + } + + public: + enc_vector2() = default; + enc_vector2(const enc_vector2&) = default; + enc_vector2(enc_vector2&&) = default; + enc_vector2& operator=(const enc_vector2&) = default; + enc_vector2& operator=(enc_vector2&&) = default; + + //! Constructor for a Container of unsigned integers. + /*! \param c A container of unsigned integers. + */ + template + enc_vector2(const Container& c); + + //! Constructor for an int_vector_buffer of unsigned integers. + /* + \param v_buf A int_vector_buf. + */ + template + enc_vector2(int_vector_buffer& v_buf); + + //! Default Destructor + ~enc_vector2() { } + + //! The number of elements in the enc_vector2. + size_type size()const + { + return m_size; + } + + //! Return the largest size that this container can ever have. + static size_type max_size() + { + return int_vector<>::max_size()/2; + } + + //! Returns if the enc_vector2 is empty. + bool empty() const + { + return 0==m_size; + } + + //! Swap method for enc_vector2 + void swap(enc_vector2& v); + + //! Iterator that points to the first element of the enc_vector2. + const const_iterator begin()const + { + return const_iterator(this, 0); + } + + //! Iterator that points to the position after the last element of the enc_vector2. + const const_iterator end()const + { + return const_iterator(this, this->m_size); + } + + //! operator[] + /*! \param i Index. \f$ i \in [0..size()-1]\f$. + */ + value_type operator[](size_type i)const; + + //! Serialize the enc_vector2 to a stream. + /*! \param out Out stream to write the data structure. + \return The number of written bytes. + */ + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const; + + //! Load the enc_vector2 from a stream. + void load(std::istream& in); + + //! Returns the i-th sample of enc_vector2 + /*! \param i The index of the sample. 0 <= i < size()/get_sample_dens() + * \return The value of the i-th sample. + */ + value_type sample(const size_type i) const; + + uint32_t get_sample_dens() const + { + return t_dens; + } + + /*! + * \param i The index of the sample for which all values till the next sample should be decoded. 0 <= i < size()/get_sample_dens() + * \param it A pointer to a uint64_t vector, whereto the values should be written + */ + void get_inter_sampled_values(const size_type i, uint64_t* it)const + { + // TODO: this will not work for blocks with m_pointers_sel(i+1)+t_dens==m_pointers_sel(i+2) + *(it++) = 0; + if (i*t_dens + t_dens - 1 < size()) { + if (i+1 < m_samples.size() and m_samples[i] + t_dens == m_samples[i+1]) { + if (m_pointers_sel(i+1) != m_pointers_sel(i+2)) { + throw std::logic_error("Should not be here"); + } + uint64_t x = 1; + while (x < t_dens) { + *(it++) = x; + ++x; + } +// throw std::logic_error("Should not be here"); + } else { + t_coder::template decode(m_z.data(), m_pointers_sel(i+1), t_dens - 1, it); + } + } else { + assert(i*t_dens < size()); + t_coder::template decode(m_z.data(), m_pointers_sel(i+1), size()-i*t_dens - 1, it); + } + }; +}; + +template +inline typename enc_vector2::value_type enc_vector2::operator[](const size_type i)const +{ + assert(i+1 != 0); + assert(i < m_size); + size_type idx = i/get_sample_dens(); + if (idx+1 < m_samples.size() and m_samples[idx]+t_dens == m_samples[idx+1]) { + return m_samples[idx] + i-t_dens*idx; + } + return m_samples[idx] + t_coder::decode_prefix_sum(m_z.data(), m_pointers_sel(idx+1), i-t_dens*idx); +} + +template +inline typename enc_vector2::value_type enc_vector2::sample(const size_type i)const +{ + assert(i*get_sample_dens()+1 != 0); + assert(i*get_sample_dens() < m_size); + return m_samples[i]; +} + +template +void enc_vector2::swap(enc_vector2& v) +{ + if (this != &v) { // if v and _this_ are not the same object + m_z.swap(v.m_z); + m_samples.swap(v.m_samples); + m_pointers.swap(v.m_pointers); + util::swap_support(m_pointers_sel, v.m_pointers_sel, &m_pointers, &v.m_pointers); + std::swap(m_size, v.m_size); + } +} + +template +template +enc_vector2::enc_vector2(int_vector_buffer& v_buf) +{ + // clear bit_vectors + clear(); + size_type n = v_buf.size(); + if (n == 0) // if c is empty there is nothing to do... + return; + value_type v1=0, v2=0, max_sample_value=0; + size_type samples=0, z_size=0; + const size_type sd = get_sample_dens(); + size_type tmp_z = 0; + bool uniform = true; +// (1) Calculate maximal value of samples and of deltas + for (size_type i=0, no_sample = 0; i < n; ++i, --no_sample) { + v2 = v_buf[i]; + if (!no_sample) { // is sample + uniform &= (v2==v1+1); + if (!uniform) { + z_size += tmp_z; + } + uniform = true; + tmp_z = 0; + no_sample = sd; + if (max_sample_value < v2) max_sample_value = v2; + ++samples; + } else { + uniform &= (v2==v1+1); + tmp_z += t_coder::encoding_length(v2-v1); + } + v1 = v2; + } + z_size += tmp_z; + +// (2) Write sample values and deltas +// (a) Initialize array for sample values and pointers + m_samples = int_vector<>(samples+1, 0, bits::hi(max_sample_value)+1); + + sd_vector_builder builder(z_size, samples); + +// (b) Initilize bit_vector for encoded data + m_z = int_vector<>(z_size, 0, 1); + uint64_t* z_data = t_coder::raw_data(m_z); + uint8_t offset = 0; + +// (c) Write sample values and deltas + z_size = 0; + tmp_z = 0; + uniform = true; + std::vector delta; + for (size_type i=0, j=0, no_sample = 0; i < n; ++i, --no_sample) { + v2 = v_buf[i]; + if (!no_sample) { // is sample + uniform &= (v2==v1+1); + if (!uniform) { + for (size_t k=0; k(builder); + m_pointers_sel.set_vector(&m_pointers); +} + +template +enc_vector2<>::size_type enc_vector2::serialize(std::ostream& out, structure_tree_node* v, std::string name)const +{ + structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); + size_type written_bytes = 0; + written_bytes += write_member(m_size, out, child, "size"); + written_bytes += m_z.serialize(out, child, "encoded deltas"); + written_bytes += m_samples.serialize(out, child, "samples"); + written_bytes += m_pointers.serialize(out, child, "pointers"); + written_bytes += m_pointers_sel.serialize(out, child, "pointers_sel"); + structure_tree::add_size(child, written_bytes); + return written_bytes; +} + +template +void enc_vector2::load(std::istream& in) +{ + read_member(m_size, in); + m_z.load(in); + m_samples.load(in); + m_pointers.load(in); + m_pointers_sel.load(in); + m_pointers_sel.set_vector(&m_pointers); +} + +} // end namespace sdsl +#endif diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp index c763b8e31..8145d6782 100644 --- a/include/sdsl/hyb_sd_vector.hpp +++ b/include/sdsl/hyb_sd_vector.hpp @@ -31,9 +31,7 @@ namespace sdsl { - -constexpr bool debug=false; -size_t g_hi_size=0; +//std::vector g_range_stats; template std::string print_vec(t_itr beg, t_itr end) @@ -152,7 +150,6 @@ inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i, uint64_t ma while (cnt + word_cnt < i) { cnt += word_cnt; if (considered > max_considered) { -// std::cout << "LOOP considered = " << considered << " max_considered = " << max_considered << std::endl; return std::numeric_limits::max(); } res = considered; @@ -164,8 +161,6 @@ inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i, uint64_t ma // add select (i-cnt) to res res += sdsl::bits::sel(w, i - cnt); if (res > max_considered) { -// std::cout << "res = " << res << " i = " << i << std::endl; -// std::cout << "IF considered = " << considered << " max_considered = " << max_considered << std::endl; return std::numeric_limits::max(); } return res; @@ -186,15 +181,11 @@ class hyb_sd_block_support_bv static size_type serialize(sdsl::bit_vector& bv, size_type offset, sdsl::int_vector<64>& data, size_type) { -// std::cout<<"write BV of size "<::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::EF: - if (debug) { - std::cout << "EF" << std::endl; - g_hi_size = m_block_start[block_id+1]-m_block_start[block_id]; - } res += hyb_sd_block_support_ef::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::FULL: - if (debug) std::cout << "FULL" << std::endl; res += hyb_sd_block_support_full::select_1(m_bottom, block_offset, in_block_offset, u); break; } @@ -757,7 +678,71 @@ class hyb_sd_vector res += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; } + return res; + } + + std::array + rank_1(std::array ij) const + { + if (ij[0] > ij[1]) { + return {rank_1(ij[0]),rank_1(ij[1])}; + } + // no we know ij[0] <= ij[1] + if (ij[0] > m_size or m_num_ones == 0) { + return {m_num_ones, m_num_ones}; + } + auto block_id = m_top_rank(ij[0]); + if (block_id == 0) { + size_type first_element = m_top_sel(1); + if (ij[1] <= first_element) { + return {0,0}; + } + return {0, rank_1(ij[1])}; // TODO: can still be optimized + } + block_id -= 1; + size_type r = block_id * t_block_size; + auto top_value = m_top_sel(block_id + 1); + size_type in_block_i = ij[0]; + in_block_i -= top_value; + size_type in_block_j = ij[1]; + in_block_j -= top_value; + + if (in_block_i == 0) { + if (ij[0]==ij[1]) { + return {r,r}; + } + return {r, rank_1(ij[1])}; // TODO: can still be optimized + } + auto u = m_top_sel(block_id + 2) - top_value; + auto bt = determine_block_type(u); + auto block_type = bt.first; + size_type block_offset = m_block_start[block_id]; + std::array res {r,r}; + + switch (block_type) { + case hyb_sd_blocktype::BV: + res[0] += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + if (in_block_j < u) { + res[1] += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + } + break; + case hyb_sd_blocktype::EF: + res[0] += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + if (in_block_j < u) { + res[1] += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + } + break; + case hyb_sd_blocktype::FULL: + res[0] += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + if (in_block_j < u) { + res[1] += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + } + break; + } + if (in_block_j >= u) { + res[1] = rank_1(ij[1]); + } return res; } @@ -912,12 +897,14 @@ class rank_support_hyb_sd set_vector(v); } - size_type rank(size_type i) const + template + t_pos rank(t_pos i) const { return m_v->rank_1(i); } - size_type operator()(size_type i) const + template + t_pos operator()(t_pos i) const { return rank(i); } diff --git a/include/sdsl/suffix_array_algorithm.hpp b/include/sdsl/suffix_array_algorithm.hpp index 1a1105eb9..0896e86dc 100644 --- a/include/sdsl/suffix_array_algorithm.hpp +++ b/include/sdsl/suffix_array_algorithm.hpp @@ -22,6 +22,7 @@ #define INCLUDED_SDSL_SUFFIX_ARRAY_ALGORITHM #include +#include #include "suffix_array_helper.hpp" namespace sdsl @@ -182,8 +183,11 @@ typename t_csa::size_type backward_search( l_res = c_begin; r_res = csa.C[cc+1] - 1; } else { - l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] - r_res = c_begin + csa.bwt.rank(r+1, c) - 1; // count c in bwt[0..r] + auto lr = csa.bwt.rank(std::array {l,r+1},c); + l_res = c_begin + lr[0]; // count c in bwt[0..l-1] + r_res = c_begin + lr[1] - 1; // count c in bwt[0..r] +// l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] +// r_res = c_begin + csa.bwt.rank(r+1, c) - 1; // count c in bwt[0..r] } } assert(r_res+1-l_res >= 0); diff --git a/include/sdsl/suffix_array_helper.hpp b/include/sdsl/suffix_array_helper.hpp index ef5334830..bec5f4aa9 100644 --- a/include/sdsl/suffix_array_helper.hpp +++ b/include/sdsl/suffix_array_helper.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "iterators.hpp" namespace sdsl @@ -273,7 +274,13 @@ class bwt_of_csa_psi template size_type rank(size_type i, const t_char c)const { - return m_csa.rank_bwt(i,c); + return m_csa.rank_bwt(i, c); + } + + template + std::array rank(std::array ij, const t_char c)const + { + return m_csa.rank_bwt(ij, c); } //! Calculates the position of the i-th c. @@ -447,6 +454,12 @@ class bwt_of_csa_wt return m_csa.rank_bwt(i, c); } + template + std::array rank(std::array ij, const t_char c)const + { + return m_csa.rank_bwt(ij, c); + } + //! Calculates the position of the i-th c. /*! * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$. From 341e2ec5c712c0c13ab4171a9568b4e341713297 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Thu, 5 May 2016 16:04:46 +1000 Subject: [PATCH 08/21] Added RL block type; restructering of code will follow --- include/sdsl/coder_elias_delta.hpp | 12 + include/sdsl/csa_sada2.hpp | 1 + include/sdsl/hyb_sd_vector.hpp | 462 +++++++++++++++++++++++------ 3 files changed, 378 insertions(+), 97 deletions(-) diff --git a/include/sdsl/coder_elias_delta.hpp b/include/sdsl/coder_elias_delta.hpp index 5ecd5ec76..f6acccba4 100644 --- a/include/sdsl/coder_elias_delta.hpp +++ b/include/sdsl/coder_elias_delta.hpp @@ -140,6 +140,18 @@ class elias_delta */ static void encode(uint64_t x, uint64_t*& z, uint8_t& offset); + + static uint64_t decode(const uint64_t*& z, uint8_t& offset) + { + size_type len_1_len; + len_1_len = bits::read_unary_and_move(z, offset); // read length of length of x + if (!len_1_len) { + return 1ULL; + } + size_type len = bits::read_int_and_move(z, offset, len_1_len) + (1ULL << len_1_len); + return bits::read_int_and_move(z, offset, len-1) + (len-1<64) * (1ULL << (len-1)); + } + template static uint64_t* raw_data(int_vector& v) { diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index aca901697..30fcd5bf4 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -84,6 +84,7 @@ class uef_psi_support uef_psi_support(int_vector_buffer<>& psi_buf, const t_csa* csa) { // std::cout<<"Hello!!!!"<C; // std::cout<<"C.size()="< g_range_stats; +size_t g_saved_bits=0; template std::string print_vec(t_itr beg, t_itr end) @@ -49,9 +51,9 @@ std::string print_vec(t_itr beg, t_itr end) inline uint64_t next0(const uint64_t* word, uint64_t idx) { word += (idx >> 6); - auto masked_inverse_word = ~(*word | sdsl::bits::lo_set[(idx & 0x3F) + 1]); + auto masked_inverse_word = ~(*word | bits::lo_set[(idx & 0x3F) + 1]); if (masked_inverse_word) { - return (idx & ~((size_t)0x3F)) + sdsl::bits::lo(masked_inverse_word); + return (idx & ~((size_t)0x3F)) + bits::lo(masked_inverse_word); } idx = (idx & ~((size_t)0x3F)) + 64; ++word; @@ -59,7 +61,7 @@ inline uint64_t next0(const uint64_t* word, uint64_t idx) idx += 64; ++word; } - return idx + sdsl::bits::lo(~(*word)); + return idx + bits::lo(~(*word)); } /*! @@ -81,7 +83,7 @@ inline uint64_t cnt(const uint64_t* word, uint64_t idx, uint64_t i) // std::cout<<"considered="<= t_block_size) { return t_block_size; @@ -94,12 +96,12 @@ inline uint64_t cnt(const uint64_t* word, uint64_t idx, uint64_t i) //std::cout<<"considered="< decode(const uint64_t* data_ptr, uint8_t offset, const uint64_t* data_ptr_end, uint8_t offset_end) + { + int_vector<64> data(t_block_size, 0); + size_t pos = 1; // data[0]=0, now decode for pos > 0 + while (pos < t_block_size) { + if (data_ptr > data_ptr_end or (data_ptr == data_ptr_end and offset >= offset_end)) { +// std::cout<<"entering corner case"<&, size_type, size_type i, size_type) + static size_type rank_1(const bit_vector&, const int_vector<>&, size_type, size_type i, size_type) { - return i; + return std::min(t_block_size, i); } }; template -class hyb_sd_block_support_ef +class hyb_sd_block_ef { public: typedef bit_vector::size_type size_type; @@ -250,8 +455,8 @@ class hyb_sd_block_support_ef public: static size_type estimate_size(size_type u) { - uint8_t logu = sdsl::bits::hi(u) + 1; - uint8_t logm = sdsl::bits::hi(t_block_size) + 1; // TODO constexpr for hi? + uint8_t logu = bits::hi(u) + 1; + uint8_t logm = bits::hi(t_block_size) + 1; // TODO constexpr for hi? if (logm == logu) logm--; size_type width_low = logu - logm; @@ -259,11 +464,11 @@ class hyb_sd_block_support_ef return size_in_bits; } static size_type - serialize(sdsl::bit_vector& bv, size_type offset, sdsl::int_vector<64>& data, size_type u) + serialize(bit_vector& bv, size_type offset, int_vector<64>& data, size_type u) { size_type written_bits = 0; - uint8_t logu = sdsl::bits::hi(u) + 1; - uint8_t logm = sdsl::bits::hi(t_block_size) + 1; + uint8_t logu = bits::hi(u) + 1; + uint8_t logm = bits::hi(t_block_size) + 1; if (logm == logu) logm--; size_type width_low = logu - logm; @@ -272,7 +477,7 @@ class hyb_sd_block_support_ef uint8_t in_word_offset = offset % 64; for (size_type i = 0; i < t_block_size; i++) { uint64_t x = data[i]; - sdsl::bits::write_int_and_move(data_ptr, x, in_word_offset, width_low); + bits::write_int_and_move(data_ptr, x, in_word_offset, width_low); } written_bits += width_low * t_block_size; @@ -283,11 +488,11 @@ class hyb_sd_block_support_ef size_type cur_high = x >> width_low; size_type write_val = cur_high - last_high; while (write_val >= 64) { - sdsl::bits::write_int_and_move(data_ptr, 0ULL, in_word_offset, 64); + bits::write_int_and_move(data_ptr, 0ULL, in_word_offset, 64); write_val -= 64; written_bits += 64; } - sdsl::bits::write_int_and_move(data_ptr, 1ULL << write_val, in_word_offset, write_val + 1); + bits::write_int_and_move(data_ptr, 1ULL << write_val, in_word_offset, write_val + 1); last_high = cur_high; written_bits += write_val + 1; } @@ -296,10 +501,10 @@ class hyb_sd_block_support_ef return written_bits; } - static size_type select_1(const sdsl::bit_vector& bv, size_type offset, size_type i, size_type u) + static size_type select_1(const bit_vector& bv, size_type offset, size_type i, size_type u) { - uint8_t logu = sdsl::bits::hi(u) + 1; - uint8_t logm = sdsl::bits::hi(t_block_size) + 1; + uint8_t logu = bits::hi(u) + 1; + uint8_t logm = bits::hi(t_block_size) + 1; if (logm == logu) logm--; size_type width_low = logu - logm; @@ -308,19 +513,19 @@ class hyb_sd_block_support_ef auto low_part_data_ptr = bv.data() + (low_part_offset / 64); uint8_t low_part_in_word_offset = low_part_offset % 64; - auto low_part = sdsl::bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low); + auto low_part = bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low); auto bucket = sel(bv.data(), hi_part_offset, i + 1) - hi_part_offset - i; return (bucket << width_low) | low_part; } - static size_type rank_1(const sdsl::bit_vector& bv, const int_vector<>& block_start, size_type block_id, size_type i, size_type u) + static size_type rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, size_type i, size_type u) { auto offset = block_start[block_id]; auto next_offset = block_start[block_id + 1]; - uint8_t logu = sdsl::bits::hi(u) + 1; - uint8_t logm = sdsl::bits::hi(t_block_size) + 1; + uint8_t logu = bits::hi(u) + 1; + uint8_t logm = bits::hi(t_block_size) + 1; if (logm == logu) logm--; size_type width_low = logu - logm; @@ -353,7 +558,7 @@ class hyb_sd_block_support_ef low_part_offset -= width_low; low_part_data_ptr = bv.data() + (low_part_offset / 64); low_part_in_word_offset = low_part_offset % 64; - } while (bv[hi_part_offset + sel_high] and sdsl::bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low) >= val_low); + } while (bv[hi_part_offset + sel_high] and bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low) >= val_low); return rank_low + 1; } }; @@ -386,8 +591,10 @@ class hyb_sd_vector sd_vector<> m_top; sd_vector<>::select_1_type m_top_sel; sd_vector<>::rank_1_type m_top_rank; - sdsl::bit_vector m_bottom; - sdsl::int_vector<> m_block_start; + bit_vector m_bottom; + int_vector<> m_block_start; + int_vector<2> m_block_type; + size_type m_size = 0; size_type m_num_ones = 0; @@ -396,51 +603,62 @@ class hyb_sd_vector static constexpr uint16_t block_size = t_block_size; private: - enum class hyb_sd_blocktype + enum class hyb_sd_blocktype : uint8_t { - EF, - BV, - FULL + EF=0, + BV=1, + FULL=2, + RL=3 }; std::pair - determine_block_type(size_t u) const + determine_block_type(size_t u, int_vector<64>& data) const { - if (u == t_block_size) + if (u == t_block_size or data[t_block_size-1] == t_block_size-1) return { hyb_sd_blocktype::FULL, 0 }; - size_type ef_bits = hyb_sd_block_support_ef::estimate_size(u); - size_type bv_bits = hyb_sd_block_support_bv::estimate_size(u); - if (bv_bits < ef_bits) - return { hyb_sd_blocktype::BV, bv_bits }; - return { hyb_sd_blocktype::EF, ef_bits }; + std::vector> size_and_type; + size_and_type.push_back({hyb_sd_block_ef::estimate_size(u), hyb_sd_blocktype::EF}); + size_and_type.push_back({hyb_sd_block_bv::estimate_size(u), hyb_sd_blocktype::BV}); + size_and_type.push_back({hyb_sd_block_rl::estimate_size(u, data), hyb_sd_blocktype::RL}); + std::sort(size_and_type.begin(), size_and_type.end()); + + /* auto mini = std::min_element(size_and_type.begin(), size_and_type.end()); + return { std::get<1>(*mini), std::get<0>(*mini) }; + */ + if (std::get<1>(size_and_type[0]) == hyb_sd_blocktype::RL) { + auto rl_size = std::get<0>(size_and_type[0]); + auto next_size = std::get<0>(size_and_type[1]); + if (2*rl_size > next_size) { + std::swap(size_and_type[0], size_and_type[1]); + } + } + return {std::get<1>(size_and_type[0]), std::get<0>(size_and_type[0])}; } - size_type compress_block(size_type offset, sdsl::int_vector<64>& data, size_t u) + size_type compress_block(size_type i, int_vector<64>& data, size_t u) { + size_type offset = m_block_start[i]; size_type written_bits = 0; - auto bt = determine_block_type(u); + auto bt = determine_block_type(u, data); auto type = bt.first; auto size_in_bits = bt.second; + m_block_type[i] = static_cast(type); if (m_bottom.size() < offset + size_in_bits) { m_bottom.resize(m_bottom.size() * 2 + size_in_bits); } - - for (size_t i=1; i(m_block_type[block_id]); size_type block_offset = m_block_start[block_id]; switch (block_type) { case hyb_sd_blocktype::BV: - res += hyb_sd_block_support_bv::select_1(m_bottom, block_offset, in_block_offset, u); + res += hyb_sd_block_bv::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::EF: - res += hyb_sd_block_support_ef::select_1(m_bottom, block_offset, in_block_offset, u); + res += hyb_sd_block_ef::select_1(m_bottom, block_offset, in_block_offset, u); break; case hyb_sd_blocktype::FULL: - res += hyb_sd_block_support_full::select_1(m_bottom, block_offset, in_block_offset, u); + res += hyb_sd_block_full::select_1(m_bottom, block_offset, in_block_offset, u); + break; + case hyb_sd_blocktype::RL: + res += hyb_sd_block_rl::select_1(m_bottom, m_block_start, block_id, in_block_offset, u); +// res += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; } return res; @@ -660,22 +882,32 @@ class hyb_sd_vector if (in_block_i == 0) return res; + auto block_type = static_cast(m_block_type[block_id]); + if (block_type == hyb_sd_blocktype::FULL) { + return res + hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, 0); + } + auto u = m_top_sel(block_id + 2) - top_value; - auto bt = determine_block_type(u); - auto block_type = bt.first; +// auto bt = determine_block_type(u); +// auto block_type = bt.first; size_type block_offset = m_block_start[block_id]; + switch (block_type) { case hyb_sd_blocktype::BV: // std::cout << "BV" << std::endl; - res += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::EF: // std::cout << "EF" << std::endl; - res += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::FULL: // std::cout << "FULL" << std::endl; - res += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res += hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + break; + case hyb_sd_blocktype::RL: +// std::cout << "RL" << std::endl; + res += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; } return res; @@ -714,29 +946,47 @@ class hyb_sd_vector return {r, rank_1(ij[1])}; // TODO: can still be optimized } + auto block_type = static_cast(m_block_type[block_id]); + if (block_type == hyb_sd_blocktype::FULL and in_block_j < t_block_size) { + return {r+hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, 0), + r+hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_j, 0) + }; + } + auto u = m_top_sel(block_id + 2) - top_value; - auto bt = determine_block_type(u); - auto block_type = bt.first; +// auto bt = determine_block_type(u); +// auto block_type = bt.first; size_type block_offset = m_block_start[block_id]; std::array res {r,r}; switch (block_type) { case hyb_sd_blocktype::BV: - res[0] += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res[0] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); if (in_block_j < u) { - res[1] += hyb_sd_block_support_bv::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + res[1] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); } break; case hyb_sd_blocktype::EF: - res[0] += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res[0] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); if (in_block_j < u) { - res[1] += hyb_sd_block_support_ef::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + res[1] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); } break; case hyb_sd_blocktype::FULL: - res[0] += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res[0] += hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); if (in_block_j < u) { - res[1] += hyb_sd_block_support_full::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + res[1] += hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + } + break; + case hyb_sd_blocktype::RL: + if (in_block_j >= u) { + res[0] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + } else { + res[0] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + res[1] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + // auto in_block_rank = hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); + // res[0] += in_block_rank[0]; + // res[1] += in_block_rank[1]; } break; } @@ -772,8 +1022,24 @@ class hyb_sd_vector written_bytes += m_top.serialize(out, child, "top"); written_bytes += m_top_sel.serialize(out, child, "top_sel"); written_bytes += m_top_rank.serialize(out, child, "top_rank"); - written_bytes += m_bottom.serialize(out, child, "bottom"); + //written_bytes += m_bottom.serialize(out, nullptr, "bottom"); + auto bottom_bytes = m_bottom.serialize(out, nullptr, "bottom"); + { + structure_tree_node* bottom_child = structure_tree::add_child(child, "bottom", util::class_name(m_bottom)); + std::array written_bits = {0,0,0,0}; + for (size_t i=1; i names = {"EF","BV","FULL","RL"}; + for (size_t i=0; i Date: Fri, 6 May 2016 18:05:39 +1000 Subject: [PATCH 09/21] Combined rank calculation for EF --- include/sdsl/hyb_sd_vector.hpp | 263 ++++++++++++++++++++++++++------- 1 file changed, 209 insertions(+), 54 deletions(-) diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp index 7f1c42822..5253537c0 100644 --- a/include/sdsl/hyb_sd_vector.hpp +++ b/include/sdsl/hyb_sd_vector.hpp @@ -138,7 +138,7 @@ inline uint64_t sel(const uint64_t* word, uint64_t idx, uint64_t i) * \param i i * \return The absolut position (in bits) of the i-th unset bit (\f$ i>0 \f$ from idx */ -inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i, uint64_t max_considered) +inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i) { // std::cout<<"cnt("<> 6); @@ -151,9 +151,6 @@ inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i, uint64_t ma while (cnt + word_cnt < i) { cnt += word_cnt; - if (considered > max_considered) { - return std::numeric_limits::max(); - } res = considered; considered += 64; w = (~(*(++word))); @@ -162,9 +159,6 @@ inline uint64_t sel0(const uint64_t* word, uint64_t idx, uint64_t i, uint64_t ma // cnt < i and cnt+word_cnt >= i // add select (i-cnt) to res res += bits::sel(w, i - cnt); - if (res > max_considered) { - return std::numeric_limits::max(); - } return res; } @@ -196,7 +190,8 @@ class hyb_sd_block_bv return sel(bv.data(), offset, i + 1) - offset; } - static size_type rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, size_type i, size_type) + static size_type + rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, size_type i, size_type) { auto offset = block_start[block_id]; auto next_offset = block_start[block_id+1]; @@ -204,6 +199,22 @@ class hyb_sd_block_bv return t_block_size; return cnt(bv.data(), offset, i); } + + static std::array + rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, const std::array& ij, size_type) + { + auto offset = block_start[block_id]; + auto next_offset = block_start[block_id+1]; + if (ij[0] > next_offset-offset) { + return {t_block_size,t_block_size}; + } + auto resi = cnt(bv.data(), offset, ij[0]); + if (ij[1] > next_offset-offset) { + return {resi, t_block_size}; + } + return {resi, resi+cnt(bv.data(), offset+ij[0], ij[1]-ij[0])}; + } + }; @@ -255,27 +266,28 @@ class hyb_sd_block_rl g_saved_bits += t_coder::encoding_length(end-1-begin); } } - - if (bv!=nullptr) { -// std::cout<<"Checking block "<& block_start, size_type block_id, size_type i, size_type) { - auto abs_offset = block_start[block_id]; auto data_ptr = bv.data()+(abs_offset/64); uint8_t offset = abs_offset%64; @@ -343,24 +354,22 @@ class hyb_sd_block_rl while (res < t_block_size and i > data_res) { if (data_ptr > data_ptr_end or (data_ptr == data_ptr_end and offset >= offset_end)) { uint64_t len = t_block_size - res; - if (i > data_res + len) { - data_res += len; + data_res += len; + if (i > data_res) { res += len; - } else { // i <= data_res + len and i > data_res - uint64_t gap = i - data_res; - data_res += gap; + } else { // i <= data_res and i > data_res - len + uint64_t gap = i - (data_res - len); res += gap; } } else { uint64_t delta = t_coder::decode(data_ptr,offset); if (delta == 1) { // encoded run of ones of length >= 1 uint64_t len = t_coder::decode(data_ptr, offset); - if (i > data_res + len) { - data_res += len; + data_res += len; + if (i > data_res) { res += len; - } else { // i <= data_res + len and i > data_res - uint64_t gap = i - data_res; - data_res += gap; + } else { // i <= data_res and i > data_res - len + uint64_t gap = i - (data_res - len); res += gap; } } else { // single delta @@ -376,6 +385,81 @@ class hyb_sd_block_rl return res; } + static std::array + rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, const std::array& ij, size_type) + { + auto abs_offset = block_start[block_id]; + auto data_ptr = bv.data()+(abs_offset/64); + uint8_t offset = abs_offset%64; + auto abs_offset_end = block_start[block_id+1]; + auto data_ptr_end = bv.data()+(abs_offset_end/64); + uint8_t offset_end = abs_offset_end%64; + + uint64_t data_res = 0; + std::array res = {0,0}; // data[0]=0, now decode for pos > 0 + size_t k=0; + for (; k<2; ++k) { + while (res[k] < t_block_size and ij[k] > data_res) { + if (data_ptr > data_ptr_end or (data_ptr == data_ptr_end and offset >= offset_end)) { + uint64_t len = t_block_size - res[k]; + data_res += len; + if (ij[k] > data_res) { + res[k] += len; + } else { // ij[k] <= data_res and i > data_res - len + uint64_t gap = ij[k] - (data_res -len); + res[k] += gap; + if (k == 0) { + if (ij[1] <= data_res) { + res[1] = res[0] + (ij[1]-ij[0]); + k = 3; break; + } else { + res[1] = res[0] - gap + len; + } + } + } + } else { + uint64_t delta = t_coder::decode(data_ptr,offset); + if (delta == 1) { // encoded run of ones of length >= 1 + uint64_t len = t_coder::decode(data_ptr, offset); + data_res += len; + if (ij[k] > data_res) { + res[k] += len; + } else { // ij[k] <= data_res and i > data_res-len + uint64_t gap = ij[k] - (data_res-len); + res[k] += gap; + if (k == 0) { + if (ij[1] <= data_res) { + res[1] = res[0] + (ij[1]-ij[0]); + k = 3; break; + } else { + res[1] = res[0] - gap + len; + } + } + } + } else { // single delta + data_res += delta; + ++res[k]; + } + } + } + if (k==0) { + res[1]= std::max(res[0], res[1]); + } + } + /* + std::array check = { rank_1(bv, block_start, block_id, ij[0], 0), + rank_1(bv, block_start, block_id, ij[1], 0)}; + if ( res != check ){ + std::cerr<<"res!=check"<& block_start, size_type block_id, size_type i, size_type) @@ -534,10 +618,11 @@ class hyb_sd_block_ef size_type hi_size = next_offset - hi_part_offset; size_type high_val = (i >> width_low); - size_type local_sel = sel0(bv.data(), hi_part_offset, high_val + 1, hi_size); - if (local_sel == std::numeric_limits::max()) { + size_type zeros_in_high = hi_size - t_block_size; + if (zeros_in_high < high_val+1) { return t_block_size; } + size_type local_sel = sel0(bv.data(), hi_part_offset, high_val + 1); size_type sel_high = local_sel; size_type rank_low = sel_high - high_val; @@ -561,6 +646,73 @@ class hyb_sd_block_ef } while (bv[hi_part_offset + sel_high] and bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low) >= val_low); return rank_low + 1; } + + static std::array + rank_1(const bit_vector& bv, const int_vector<>& block_start, size_type block_id, std::array ij, size_type u) + { + auto start_offset = block_start[block_id]; + auto next_offset = block_start[block_id + 1]; + + uint8_t logu = bits::hi(u) + 1; + uint8_t logm = bits::hi(t_block_size) + 1; + if (logm == logu) + logm--; + size_type width_low = logu - logm; + + size_type hi_part_offset = start_offset + t_block_size * width_low; + size_type hi_size = next_offset - hi_part_offset; + + std::array high_val = {(ij[0] >> width_low),(ij[1] >> width_low)}; + + size_type zeros_in_high = hi_size - t_block_size; + if (zeros_in_high < high_val[0]+1) { // check if there is a zero to select + return {t_block_size, t_block_size}; + } + std::array res = {0,0}; + std::array local_sel; + local_sel[0]= sel0(bv.data(), hi_part_offset, high_val[0] + 1); + if (high_val[0] == high_val[1]) { + local_sel[1] = local_sel[0]; + } else { // now high_val[0] < high_val[1] + if (zeros_in_high < high_val[1]+1) { + res = {0, t_block_size}; // initialized second result + } else { + if (zeros_in_high < high_val[1]+1) { // check if there is a zero to select + res = {0, t_block_size}; + } else { // there is something to select ;) + size_type skip = local_sel[0]+1; + local_sel[1] = sel0(bv.data(), hi_part_offset+skip, high_val[1]-high_val[0]) + skip; + } + } + } + + bool done1 = (res[1]==t_block_size); + for (size_t k=1-done1, s=done1; s<2; ++s, --k) { + size_type sel_high = local_sel[k]; + size_type rank_low = sel_high - high_val[k]; + if (0 == rank_low) { + return {0,res[1]}; + } + + size_type low_part_offset = start_offset + rank_low * width_low; + size_type val_low = ij[k] & bits::lo_set[width_low]; + auto low_part_data_ptr = bv.data() + (low_part_offset / 64); + uint8_t low_part_in_word_offset = low_part_offset % 64; + + do { + if (!sel_high) { + return {0, res[1]}; + } + --rank_low; + --sel_high; + low_part_offset -= width_low; + low_part_data_ptr = bv.data() + (low_part_offset / 64); + low_part_in_word_offset = low_part_offset % 64; + } while (bv[hi_part_offset + sel_high] and bits::read_int(low_part_data_ptr, low_part_in_word_offset, width_low) >= val_low); + res[k] = rank_low+1; + } + return res; + } }; template ::select_1(m_bottom, m_block_start, block_id, in_block_offset, u); -// res += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; } return res; @@ -961,15 +1112,21 @@ class hyb_sd_vector switch (block_type) { case hyb_sd_blocktype::BV: - res[0] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); - if (in_block_j < u) { - res[1] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + if (in_block_j >= u) { + res[0] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + } else { + auto in_block_rank = hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); + res[0] += in_block_rank[0]; + res[1] += in_block_rank[1]; } break; case hyb_sd_blocktype::EF: - res[0] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); - if (in_block_j < u) { - res[1] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); + if (in_block_j >= u) { + res[0] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); + } else { + auto in_block_rank = hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); + res[0] += in_block_rank[0]; + res[1] += in_block_rank[1]; } break; case hyb_sd_blocktype::FULL: @@ -982,11 +1139,9 @@ class hyb_sd_vector if (in_block_j >= u) { res[0] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); } else { - res[0] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); - res[1] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); - // auto in_block_rank = hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); - // res[0] += in_block_rank[0]; - // res[1] += in_block_rank[1]; + auto in_block_rank = hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); + res[0] += in_block_rank[0]; + res[1] += in_block_rank[1]; } break; } From 23bfeebe37760b9c783240fadf7ab72d23af3005 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Sun, 8 May 2016 16:29:32 +1000 Subject: [PATCH 10/21] Handling rare characters --- include/sdsl/csa_sada2.hpp | 147 ++++++++++++++++++++++++++++++------- 1 file changed, 121 insertions(+), 26 deletions(-) diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index 30fcd5bf4..e1b540da5 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -65,6 +65,11 @@ class uef_psi_support std::vector m_inc_seq; std::vector m_inc_seq_rank; std::vector m_inc_seq_sel; + bit_vector m_sml; // indicates if a context is small or large + rank_support_v5<> m_sml_rank; // rank for m_sml + wt_huff_int<> m_sml_wt; // wt to get rank to index into + std::vector> m_sml_inc_seq;// small sequences + const t_csa* m_csa; void set_vector() @@ -87,24 +92,56 @@ class uef_psi_support g_saved_bits=0; set_vector(csa); const auto& C = m_csa->C; -// std::cout<<"C.size()="< sml(sigma_small, 0, bits::hi(threshold)+1); + +// (2) Create a vector containing only the small context sizes + for (size_t i=0, ii=0; i(size, 0, bits::hi(m_csa->size())+1); + } + +// (5) Initialize m_inc_seq (to store the larger contexts + m_inc_seq.resize(sigma_large); + m_inc_seq_rank.resize(sigma_large); + m_inc_seq_sel.resize(sigma_large); + for (size_t i=0,i0=0,i1=0; i v(C[i+1]-C[i]); - //bit_vector bv(size(), 0); for (size_t j=C[i]; jC[cc+1] - m_csa->C[cc]; // context size + auto rank = m_sml_wt.rank(cc_sml, cs); + size_type begin = rank*cs; + for (size_t j=0; j= i) + return j; + } + return cs; + } else { + size_type cc_large = cc - m_sml_rank(cc); + return m_inc_seq_rank[cc_large](i); + } } std::array rank(std::array ij, comp_char_type cc) const { - return m_inc_seq_rank[cc](ij); + if (m_sml[cc]) { + auto cc_sml = m_sml_rank(cc); + size_type cs = m_csa->C[cc+1] - m_csa->C[cc]; // context size + auto rank = m_sml_wt.rank(cc_sml, cs); + size_type begin = rank*cs; + std::array res = {0,0}; + size_t j=0; + for (size_t k=0; k<2; ++k) { + while (j < cs and m_sml_inc_seq[cs][begin+j] < ij[k]) { + ++j; + } + res[k] = j; + } + return res; + } else { + size_type cc_large = cc - m_sml_rank(cc); + return m_inc_seq_rank[cc_large](ij); + } } uint64_t select(uint64_t i, comp_char_type cc) const { - return m_inc_seq_sel[cc](i); + if (m_sml[cc]) { + auto cc_sml = m_sml_rank(cc); + size_type cs = m_csa->C[cc+1] - m_csa->C[cc]; // context size + auto rank = m_sml_wt.rank(cc_sml, cs); + return m_sml_inc_seq[cs][rank*cs+(i-1)]; + } else { + size_type cc_large = cc - m_sml_rank(cc); + return m_inc_seq_sel[cc_large](i); + } } value_type operator[](const size_type i) const { size_t cc = std::upper_bound(m_csa->C.begin(), m_csa->C.end(),i) - m_csa->C.begin() - 1; size_t cum_sum = m_csa->C[cc]; - /* if (50929==i){ - std::cout<sigma="<<(size_t)m_csa->sigma<C[cc+1]-1<<"] of size "<C[cc+1]-cum_sum< Date: Sun, 8 May 2016 16:52:33 +1000 Subject: [PATCH 11/21] Removed unnecessary select in WT --- include/sdsl/csa_sada2.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index e1b540da5..193ba2337 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -60,6 +60,10 @@ class uef_psi_support typedef ptrdiff_t difference_type; typedef csa_member_tag category; typedef int_alphabet_tag alphabet_category; + typedef wt_huff_int, + select_support_scan<0>> sml_wt_type; private: std::vector m_inc_seq; @@ -67,7 +71,7 @@ class uef_psi_support std::vector m_inc_seq_sel; bit_vector m_sml; // indicates if a context is small or large rank_support_v5<> m_sml_rank; // rank for m_sml - wt_huff_int<> m_sml_wt; // wt to get rank to index into + sml_wt_type m_sml_wt; // wt to get rank to index into std::vector> m_sml_inc_seq;// small sequences const t_csa* m_csa; From 163c6721c58cfe849432b72239a6065142fd1353 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Fri, 13 May 2016 18:10:03 +0200 Subject: [PATCH 12/21] Fixed case where i>m_size --- include/sdsl/csa_sada2.hpp | 22 +++++++++++---- include/sdsl/hyb_sd_vector.hpp | 51 ++++++++++++++++++++++++++++++---- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index 193ba2337..545dd7f38 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -61,9 +61,9 @@ class uef_psi_support typedef csa_member_tag category; typedef int_alphabet_tag alphabet_category; typedef wt_huff_int, select_support_scan<1>, - select_support_scan<0>> sml_wt_type; + select_support_scan<0>> sml_wt_type; private: std::vector m_inc_seq; @@ -200,6 +200,7 @@ class uef_psi_support } return cs; } else { +// std::cout<<"single_rank: for i="<C[cc+1] - m_csa->C[cc]; // context size - auto rank = m_sml_wt.rank(cc_sml, cs); - size_type begin = rank*cs; + auto rnk = m_sml_wt.rank(cc_sml, cs); + size_type begin = rnk*cs; std::array res = {0,0}; size_t j=0; for (size_t k=0; k<2; ++k) { @@ -220,10 +221,21 @@ class uef_psi_support } res[k] = j; } +// std::array res2 = {rank(ij[0],cc),rank(ij[1],cc)}; +// if ( res != res2 ){ +// std::cout<<"double rank: res=["< res2 = {rank(ij[0],cc),rank(ij[1],cc)}; +// std::cout<<"_double rank: res=["<& block_start, size_type block_id, size_type i, size_type u) { +//std::cout<<">>>>>>>>rank_1("<>sel_high "< m_size or m_num_ones == 0) { +//std::cout<<"!!! i > m_size "< "<(m_block_type[block_id]); if (block_type == hyb_sd_blocktype::FULL) { +//std::cout<<"!!! hdb_sd_blocktype::FULL"<::rank_1(m_bottom, m_block_start, block_id, in_block_i, 0); } @@ -1045,19 +1076,19 @@ class hyb_sd_vector switch (block_type) { case hyb_sd_blocktype::BV: -// std::cout << "BV" << std::endl; +// std::cout << "!!!BV" << std::endl; res += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::EF: -// std::cout << "EF" << std::endl; +// std::cout << "!!!single EF in_block_i="<::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::FULL: -// std::cout << "FULL" << std::endl; +// std::cout << "!!!FULL" << std::endl; res += hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::RL: -// std::cout << "RL" << std::endl; +// std::cout << "!!!RL" << std::endl; res += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; } @@ -1074,6 +1105,9 @@ class hyb_sd_vector if (ij[0] > m_size or m_num_ones == 0) { return {m_num_ones, m_num_ones}; } + if (ij[1] > m_size or m_num_ones == 0) { + return {rank_1(ij[0]), m_num_ones}; + } auto block_id = m_top_rank(ij[0]); if (block_id == 0) { size_type first_element = m_top_sel(1); @@ -1112,6 +1146,7 @@ class hyb_sd_vector switch (block_type) { case hyb_sd_blocktype::BV: +// std::cout<<"double rank_1 for BV"<= u) { res[0] += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); } else { @@ -1121,21 +1156,25 @@ class hyb_sd_vector } break; case hyb_sd_blocktype::EF: +// std::cout<<"double rank_1 for EF"<= u) { res[0] += hyb_sd_block_ef::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); } else { +// std::cout<<"call double"<::rank_1(m_bottom, m_block_start, block_id, {in_block_i, in_block_j}, u); res[0] += in_block_rank[0]; res[1] += in_block_rank[1]; } break; case hyb_sd_blocktype::FULL: +// std::cout<<"double rank_1 for FULL"<::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); if (in_block_j < u) { res[1] += hyb_sd_block_full::rank_1(m_bottom, m_block_start, block_id, in_block_j, u); } break; case hyb_sd_blocktype::RL: +// std::cout<<"double rank_1 for RL"<= u) { res[0] += hyb_sd_block_rl::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); } else { From 4fb60d3c4fff034ea5d2dc4906f58f44a381b950 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Sat, 14 May 2016 08:37:16 +0200 Subject: [PATCH 13/21] fixed alignment error --- include/sdsl/csa_sada.hpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index cec3d5db9..b3c5bfa60 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -298,28 +298,42 @@ class csa_sada template size_type rank_comp_bwt(size_type i, const t_char cc)const { +// std::cout<<"rank_comp_bwt("< search in previous block (s_begin-1) +// std::cout<<"case (1)"<= i) { // now s_begin < s_end // Case (2): Some samples inside [cc_begin, cc_end) // and first sample already larger or equal to i // => search in previous block (s_begin-1) +// std::cout<<"case (2): "<= " << i << std::endl; } else { // still s_begin < s_end // Case (3): Some samples inside [cc_begin, cc_end) // and first sample smaller than i // => binary search for first sample >= i s_begin = upper_bound(s_begin, s_end, i-1); // => search in previous block (s_begin-1) +// std::cout<<"case (3): s_begin = " << s_begin << " (s_end=" << s_end <<" )"<< std::endl; +// std::cout<<">>>>> m_psi.sample(s_begin-1)="< Date: Mon, 16 May 2016 00:13:31 +0200 Subject: [PATCH 14/21] don't use double_rank --- include/sdsl/suffix_array_algorithm.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/sdsl/suffix_array_algorithm.hpp b/include/sdsl/suffix_array_algorithm.hpp index 0896e86dc..51bd18430 100644 --- a/include/sdsl/suffix_array_algorithm.hpp +++ b/include/sdsl/suffix_array_algorithm.hpp @@ -183,11 +183,11 @@ typename t_csa::size_type backward_search( l_res = c_begin; r_res = csa.C[cc+1] - 1; } else { - auto lr = csa.bwt.rank(std::array {l,r+1},c); - l_res = c_begin + lr[0]; // count c in bwt[0..l-1] - r_res = c_begin + lr[1] - 1; // count c in bwt[0..r] -// l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] -// r_res = c_begin + csa.bwt.rank(r+1, c) - 1; // count c in bwt[0..r] +// auto lr = csa.bwt.rank(std::array {l,r+1},c); +// l_res = c_begin + lr[0]; // count c in bwt[0..l-1] +// r_res = c_begin + lr[1] - 1; // count c in bwt[0..r] + l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] + r_res = c_begin + csa.bwt.rank(r+1, c) - 1; // count c in bwt[0..r] } } assert(r_res+1-l_res >= 0); From 1e2b62737702e25e34f34a8fa4bbb652c691e136 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Sat, 30 Jul 2016 12:45:01 +0200 Subject: [PATCH 15/21] Fixed error in construction --- include/sdsl/hyb_sd_vector.hpp | 27 +++++++--- test/bit_vector_test.cpp | 18 +++++-- test/rank_support_test.cpp | 93 ++++++++++++++++++---------------- test/select_support_test.cpp | 61 +++++++++++----------- 4 files changed, 114 insertions(+), 85 deletions(-) diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp index 1b73d9977..ea4f1d0e7 100644 --- a/include/sdsl/hyb_sd_vector.hpp +++ b/include/sdsl/hyb_sd_vector.hpp @@ -778,8 +778,7 @@ class hyb_sd_vector static constexpr uint16_t block_size = t_block_size; private: - enum class hyb_sd_blocktype : uint8_t - { + enum class hyb_sd_blocktype : uint8_t { EF=0, BV=1, FULL=2, @@ -870,7 +869,8 @@ class hyb_sd_vector if (bv[i] == 1) { last_one = i; if (++one_found == t_block_size) { - top_lvl.push_back(i-t_block_size*top_lvl.size()); + top_lvl.push_back(i); + //top_lvl.push_back(i-t_block_size*top_lvl.size()); one_found = 0; } } @@ -999,7 +999,9 @@ class hyb_sd_vector value_type operator[](size_type i)const { - return rank_1(i+1) - rank_1(i); + auto ranks = rank_1({{i+1,i}}); + return ranks[1]-ranks[0]; +// return rank_1(i+1) - rank_1(i); } //! Accessing the i-th element of the original bit_vector @@ -1042,6 +1044,10 @@ class hyb_sd_vector size_type rank_1(size_type i) const { +// bool debug = false; +// if ( i==2075 or i==2076) { +// debug = true; +// } //std::cout<<"!!! rank_1("< m_size or m_num_ones == 0) { //std::cout<<"!!! i > m_size "< "<(m_block_type[block_id]); +//if (debug) { +// std::cout<<"!!! i="<::rank_1(m_bottom, m_block_start, block_id, in_block_i, 0); } @@ -1080,7 +1090,9 @@ class hyb_sd_vector res += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::EF: -// std::cout << "!!!single EF in_block_i="< Date: Tue, 15 Nov 2016 08:38:12 +0100 Subject: [PATCH 17/21] Fix and debug information --- include/sdsl/hyb_sd_vector.hpp | 29 ++++++++++++++++++++++------- lib/hyb_sd_vector.cpp | 4 ++++ test/bit_vector_test.cpp | 4 +++- test/rank_support_test.cpp | 5 +++-- test/select_support_test.cpp | 3 +++ 5 files changed, 35 insertions(+), 10 deletions(-) create mode 100644 lib/hyb_sd_vector.cpp diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp index 1b73d9977..c7e4e97b7 100644 --- a/include/sdsl/hyb_sd_vector.hpp +++ b/include/sdsl/hyb_sd_vector.hpp @@ -33,7 +33,7 @@ namespace sdsl { //std::vector g_range_stats; -size_t g_saved_bits=0; +extern size_t g_saved_bits; template std::string print_vec(t_itr beg, t_itr end) @@ -778,8 +778,7 @@ class hyb_sd_vector static constexpr uint16_t block_size = t_block_size; private: - enum class hyb_sd_blocktype : uint8_t - { + enum class hyb_sd_blocktype : uint8_t { EF=0, BV=1, FULL=2, @@ -870,7 +869,8 @@ class hyb_sd_vector if (bv[i] == 1) { last_one = i; if (++one_found == t_block_size) { - top_lvl.push_back(i-t_block_size*top_lvl.size()); + top_lvl.push_back(i); + //top_lvl.push_back(i-t_block_size*top_lvl.size()); one_found = 0; } } @@ -999,7 +999,9 @@ class hyb_sd_vector value_type operator[](size_type i)const { - return rank_1(i+1) - rank_1(i); + auto ranks = rank_1({{i+1,i}}); + return ranks[1]-ranks[0]; +// return rank_1(i+1) - rank_1(i); } //! Accessing the i-th element of the original bit_vector @@ -1042,6 +1044,10 @@ class hyb_sd_vector size_type rank_1(size_type i) const { +// bool debug = false; +// if ( i==2075 or i==2076) { +// debug = true; +// } //std::cout<<"!!! rank_1("< m_size or m_num_ones == 0) { //std::cout<<"!!! i > m_size "< "<(m_block_type[block_id]); +//if (debug) { +// std::cout<<"!!! i="<::rank_1(m_bottom, m_block_start, block_id, in_block_i, 0); } @@ -1080,7 +1090,9 @@ class hyb_sd_vector res += hyb_sd_block_bv::rank_1(m_bottom, m_block_start, block_id, in_block_i, u); break; case hyb_sd_blocktype::EF: -// std::cout << "!!!single EF in_block_i="<, +hyb_sd_vector<> +/*bit_vector_il<64>, bit_vector_il<128>, bit_vector_il<256>, bit_vector_il<512>, @@ -36,6 +37,7 @@ rrr_vector<128>, sd_vector<>, sd_vector >, hyb_vector<> +*/ > Implementations; diff --git a/test/rank_support_test.cpp b/test/rank_support_test.cpp index b73004877..68966ef11 100644 --- a/test/rank_support_test.cpp +++ b/test/rank_support_test.cpp @@ -17,7 +17,8 @@ class rank_support_test : public ::testing::Test { }; using testing::Types; typedef Types, - rank_support_il<1, 512>, + hyb_sd_vector<>::rank_1_type +/* rank_support_il<1, 512>, rank_support_il<1, 1024>, rank_support_rrr<>, rank_support_v<>, @@ -61,7 +62,7 @@ typedef Types, rank_support_v5<10,2>, rank_support_v5<01,2>, rank_support_v5<00,2>, - rank_support_v5<11,2> + rank_support_v5<11,2>*/ > Implementations; TYPED_TEST_CASE(rank_support_test, Implementations); diff --git a/test/select_support_test.cpp b/test/select_support_test.cpp index 9c305c453..7f00231b8 100644 --- a/test/select_support_test.cpp +++ b/test/select_support_test.cpp @@ -17,6 +17,8 @@ class select_support_test : public ::testing::Test { }; using testing::Types; typedef Types, + hyb_sd_vector<>::select_1_type +/* select_support_rrr<1, 256>, select_support_rrr<1, 129>, select_support_rrr<1, 192>, @@ -46,6 +48,7 @@ typedef Types, select_support_mcl<10,2>, select_support_mcl<00,2>, select_support_mcl<11,2> +*/ > Implementations; TYPED_TEST_CASE(select_support_test, Implementations); From c158125a4e700f092c60ebf1ac7d5d6b784b217d Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Thu, 14 Dec 2017 23:43:02 +0100 Subject: [PATCH 18/21] csa_sada2 --- include/sdsl/bits.hpp | 25 +++-- include/sdsl/config.hpp | 25 +++++ include/sdsl/csa_alphabet_strategy.hpp | 63 ++---------- include/sdsl/csa_sada.hpp | 15 ++- include/sdsl/csa_sada2.hpp | 130 ++++++++++--------------- include/sdsl/enc_vector2.hpp | 15 --- include/sdsl/hyb_sd_vector.hpp | 64 +++++++----- include/sdsl/structure_tree.hpp | 1 + include/sdsl/uint128_t.hpp | 2 +- test/csa_byte_test.cpp.cmake | 1 - test/csa_byte_test.typedef | 1 + test/csa_int_test.typedef | 1 + 12 files changed, 156 insertions(+), 187 deletions(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 4e152faa3..94caa6cb2 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -11,12 +11,15 @@ #include // for uint64_t uint32_t declaration #include // for cerr #include -#ifdef __SSE4_2__ + +// clang-format off +#if 1 #include #endif -#ifdef __BMI2__ +#if 0 #include #endif +// clang-format on #ifdef WIN32 #include "iso646.h" @@ -490,9 +493,11 @@ struct bits_impl { template inline uint64_t bits_impl::cnt(uint64_t x) { -#ifdef __SSE4_2__ +// clang-format off +#if 1 return __builtin_popcountll(x); #else +// clang-format on #ifdef POPCOUNT_TL return lt_cnt[x & 0xFFULL] + lt_cnt[(x >> 8) & 0xFFULL] + lt_cnt[(x >> 16) & 0xFFULL] + lt_cnt[(x >> 24) & 0xFFULL] + lt_cnt[(x >> 32) & 0xFFULL] + lt_cnt[(x >> 40) & 0xFFULL] + @@ -579,11 +584,13 @@ inline uint64_t bits_impl::map01(uint64_t x, uint64_t c) template inline uint32_t bits_impl::sel(uint64_t x, uint32_t i) { -#ifdef __BMI2__ +// clang-format off +#if 0 // taken from folly return _tzcnt_u64(_pdep_u64(1ULL << (i - 1), x)); #endif -#ifdef __SSE4_2__ +#if 1 + // clang-format on uint64_t s = x, b; s = s - ((s >> 1) & 0x5555555555555555ULL); s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL); @@ -653,10 +660,12 @@ inline uint32_t bits_impl::_sel(uint64_t x, uint32_t i) template inline uint32_t bits_impl::hi(uint64_t x) { -#ifdef __SSE4_2__ +// clang-format off +#if 1 if (x == 0) return 0; return 63 - __builtin_clzll(x); #else + // clang-format on uint64_t t, tt; // temporaries if ((tt = x >> 32)) { // hi >= 32 if ((t = tt >> 16)) { // hi >= 48 @@ -679,10 +688,12 @@ inline uint32_t bits_impl::hi(uint64_t x) template inline uint32_t bits_impl::lo(uint64_t x) { -#ifdef __SSE4_2__ +// clang-format off +#if 1 if (x == 0) return 0; return __builtin_ctzll(x); #else + // clang-format on if (x & 1) return 0; if (x & 3) return 1; if (x & 7) return 2; diff --git a/include/sdsl/config.hpp b/include/sdsl/config.hpp index bd45b9c87..c2a59fda8 100644 --- a/include/sdsl/config.hpp +++ b/include/sdsl/config.hpp @@ -124,6 +124,31 @@ using key_text_trait = key_text_trait_impl; template using key_bwt_trait = key_bwt_trait_impl; +template +constexpr const char* key_text() +{ + return conf::KEY_TEXT_INT; +} + +template +constexpr const char* key_bwt() +{ + return conf::KEY_BWT_INT; +} + + +template<> +inline constexpr const char* key_text<8>() +{ + return conf::KEY_TEXT; +} + +template<> +inline constexpr const char* key_bwt<8>() +{ + return conf::KEY_BWT; +} + } #endif diff --git a/include/sdsl/csa_alphabet_strategy.hpp b/include/sdsl/csa_alphabet_strategy.hpp index 6e147850b..752bfddaf 100644 --- a/include/sdsl/csa_alphabet_strategy.hpp +++ b/include/sdsl/csa_alphabet_strategy.hpp @@ -69,31 +69,6 @@ template, > class int_alphabet; -template -constexpr const char* key_text() -{ - return conf::KEY_TEXT_INT; -} - -template -constexpr const char* key_bwt() -{ - return conf::KEY_BWT_INT; -} - - -template<> -inline constexpr const char* key_text<8>() -{ - return conf::KEY_TEXT; -} - -template<> -inline constexpr const char* key_bwt<8>() -{ - return conf::KEY_BWT; -} - template struct alphabet_trait { typedef byte_alphabet type; @@ -465,16 +440,6 @@ class succinct_byte_alphabet return *this; } - //! Swap operator - void swap(succinct_byte_alphabet& strat) - { - m_char.swap(strat.m_char); - util::swap_support(m_char_rank, strat.m_char_rank, &m_char, &(strat.m_char)); - util::swap_support(m_char_select, strat.m_char_select, &m_char, &(strat.m_char)); - m_C.swap(strat.m_C); - std::swap(m_sigma,strat.m_sigma); - } - //! Serialize method size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { @@ -619,12 +584,6 @@ class succinct_multibyte_alphabet return multi_C[static_cast(c)]; } - void swap(multibyte_C& t) - { - C.swap(t.C); - multi_C.swap(t.multi_C); - } - //! Serialize method size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { @@ -796,18 +755,6 @@ class succinct_multibyte_alphabet return *this; } - //! Swap operator - void swap(succinct_multibyte_alphabet& strat) - { - m_char.swap(strat.m_char); - util::swap_support(m_char_rank, strat.m_char_rank, &m_char, &(strat.m_char)); - util::swap_support(m_char_select, strat.m_char_select, &m_char, &(strat.m_char)); - m_C.swap(strat.m_C); - std::swap(m_sigma,strat.m_sigma); - std::swap(m_sigma_q,strat.m_sigma_q); - std::swap(m_sigma_q_1,strat.m_sigma_q_1); - } - //! Serialize method size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { @@ -972,7 +919,15 @@ class int_alphabet if (is_continuous_alphabet(D)) { // do not initialize m_char, m_char_rank and m_char_select since we can map directly } else { - init_char_bitvector(m_char, D); + // note: the alphabet has at least size 1, so the following is safe: + size_type largest_symbol = (--D.end())->first; + bit_vector tmp_char(largest_symbol+1, 0); + for (std::map::const_iterator it = D.begin(), end=D.end(); it != end; ++it) { + tmp_char[it->first] = 1; + } + m_char = tmp_char; + util::init_support(m_char_rank, &m_char); + util::init_support(m_char_select, &m_char); } assert(D.find(0) != D.end() and 1 == D[0]); // null-byte should occur exactly once diff --git a/include/sdsl/csa_sada.hpp b/include/sdsl/csa_sada.hpp index a1c0f16b6..8451c9656 100644 --- a/include/sdsl/csa_sada.hpp +++ b/include/sdsl/csa_sada.hpp @@ -483,17 +483,16 @@ csa_sada::KEY_BWT, config)) { + if (!cache_file_exists(key_bwt(), config)) { return; } - int_vector_buffer bwt_buf(cache_file_name(key_trait::KEY_BWT,config)); + int_vector_buffer bwt_buf(cache_file_name(key_bwt(),config)); size_type n = bwt_buf.size(); { auto event = memory_monitor::event("construct csa-alpbabet"); // alphabet_type tmp_alphabet(bwt_buf, n); // TODO: maybe it is possible to use _buf_buf again for multibyte!! - int_vector_buffer text_buf(cache_file_name(key_trait::KEY_TEXT,config)); - alphabet_type tmp_alphabet(text_buf, n); - m_alphabet.swap(tmp_alphabet); + int_vector_buffer text_buf(cache_file_name(key_text(),config)); + m_alphabet = alphabet_type(text_buf, n); } int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1); @@ -516,8 +515,7 @@ cache_config& config) { auto event = memory_monitor::event("encode PSI"); int_vector_buffer<> psi_buf(cache_file_name(conf::KEY_PSI, config)); - t_enc_vec tmp_psi(psi_buf); - m_psi.swap(tmp_psi); + m_psi = t_enc_vec(psi_buf); /* enc_vector m_psi_check(psi_buf); if ( m_psi_check.size() != m_psi.size() ){ @@ -552,8 +550,7 @@ cache_config& config) } { auto event = memory_monitor::event("sample SA"); - sa_sample_type tmp_sa_sample(config); - m_sa_sample.swap(tmp_sa_sample); + m_sa_sample = sa_sample_type(config); } { auto event = memory_monitor::event("sample ISA"); diff --git a/include/sdsl/csa_sada2.hpp b/include/sdsl/csa_sada2.hpp index 545dd7f38..e800ec069 100644 --- a/include/sdsl/csa_sada2.hpp +++ b/include/sdsl/csa_sada2.hpp @@ -69,14 +69,14 @@ class uef_psi_support std::vector m_inc_seq; std::vector m_inc_seq_rank; std::vector m_inc_seq_sel; - bit_vector m_sml; // indicates if a context is small or large - rank_support_v5<> m_sml_rank; // rank for m_sml - sml_wt_type m_sml_wt; // wt to get rank to index into - std::vector> m_sml_inc_seq;// small sequences + bit_vector m_sml; // indicates if a context is small or large + rank_support_v5<> m_sml_rank; // rank for m_sml + sml_wt_type m_sml_wt; // wt to get rank to index into + std::vector> m_sml_inc_seq; // small sequences const t_csa* m_csa; - void set_vector() + void set_inc_seq_rank_select() { for (size_t i=0; i& psi_buf, const t_csa* csa) { -// std::cout<<"Hello!!!!"<C; @@ -125,7 +123,7 @@ class uef_psi_support m_sml_inc_seq[cs] = int_vector<>(size, 0, bits::hi(m_csa->size())+1); } -// (5) Initialize m_inc_seq (to store the larger contexts +// (5) Initialize m_inc_seq (to store the larger contexts) m_inc_seq.resize(sigma_large); m_inc_seq_rank.resize(sigma_large); m_inc_seq_sel.resize(sigma_large); @@ -141,11 +139,10 @@ class uef_psi_support m_sml_inc_seq[v.size()][j+start_pos] = v[j]; } } else { - t_hyb_vec tmp(v.begin(), v.end()); - m_inc_seq[i0++].swap(tmp); + m_inc_seq[i0++] = t_hyb_vec(v.begin(), v.end()); } } - set_vector(); + set_inc_seq_rank_select(); } uef_psi_support& operator=(const uef_psi_support& psi) @@ -159,7 +156,7 @@ class uef_psi_support m_sml_rank.set_vector(&m_sml); m_sml_wt = psi.m_sml_wt; m_sml_inc_seq = psi.m_sml_inc_seq; - set_vector(); + set_inc_seq_rank_select(); set_vector(psi.m_csa); } return *this; @@ -168,6 +165,7 @@ class uef_psi_support uef_psi_support& operator=(uef_psi_support&& psi) { if (this != &psi) { + set_vector(psi.m_csa); m_inc_seq = std::move(psi.m_inc_seq); m_inc_seq_rank = std::move(psi.m_inc_seq_rank); m_inc_seq_sel = std::move(psi.m_inc_seq_sel); @@ -176,8 +174,7 @@ class uef_psi_support m_sml_rank.set_vector(&m_sml); m_sml_wt = std::move(psi.m_sml_wt); m_sml_inc_seq = std::move(psi.m_sml_inc_seq); - set_vector(); - set_vector(psi.m_csa); + set_inc_seq_rank_select(); } return *this; } @@ -254,15 +251,27 @@ class uef_psi_support value_type operator[](const size_type i) const { +// std::cout<<"call::psi["<C.begin(), m_csa->C.end(),i) - m_csa->C.begin() - 1; +// std::cout<<"cc="<C[cc]; +// std::cout<<"cum_sum="< csa_sada2::csa_sada2(cache_config& config) { - if (!cache_file_exists(key_trait::KEY_BWT, config)) { + if (!cache_file_exists(key_bwt(), config)) { return; } - int_vector_buffer bwt_buf(cache_file_name(key_trait::KEY_BWT,config)); + int_vector_buffer bwt_buf(cache_file_name(key_bwt(),config)); size_type n = bwt_buf.size(); { auto event = memory_monitor::event("construct csa-alpbabet"); // alphabet_type tmp_alphabet(bwt_buf, n); // TODO: maybe it is possible to use _buf_buf again for multibyte!! - int_vector_buffer text_buf(cache_file_name(key_trait::KEY_TEXT,config)); - alphabet_type tmp_alphabet(text_buf, n); - m_alphabet.swap(tmp_alphabet); + int_vector_buffer text_buf(cache_file_name(key_text(),config)); + m_alphabet = alphabet_type(text_buf, n); } int_vector<> cnt_chr(sigma, 0, bits::hi(n)+1); @@ -677,8 +654,7 @@ csa_sada2::value_type enc_vector2 -void enc_vector2::swap(enc_vector2& v) -{ - if (this != &v) { // if v and _this_ are not the same object - m_z.swap(v.m_z); - m_samples.swap(v.m_samples); - m_pointers.swap(v.m_pointers); - util::swap_support(m_pointers_sel, v.m_pointers_sel, &m_pointers, &v.m_pointers); - std::swap(m_size, v.m_size); - } -} - template template enc_vector2::enc_vector2(int_vector_buffer& v_buf) diff --git a/include/sdsl/hyb_sd_vector.hpp b/include/sdsl/hyb_sd_vector.hpp index c7e4e97b7..47a80a87f 100644 --- a/include/sdsl/hyb_sd_vector.hpp +++ b/include/sdsl/hyb_sd_vector.hpp @@ -32,9 +32,6 @@ namespace sdsl { -//std::vector g_range_stats; -extern size_t g_saved_bits; - template std::string print_vec(t_itr beg, t_itr end) { @@ -261,10 +258,6 @@ class hyb_sd_block_rl if (end-begin > 1) { // rl_bits += do_encode(1); // rl_bits += do_encode(end-1-begin); - if (bv != nullptr) { - g_saved_bits += t_coder::encoding_length(1); - g_saved_bits += t_coder::encoding_length(end-1-begin); - } } /* if ( bv!=nullptr ) { @@ -841,6 +834,47 @@ class hyb_sd_vector hyb_sd_vector() {} + hyb_sd_vector(const hyb_sd_vector& vec) { + *this = vec; + } + + hyb_sd_vector(hyb_sd_vector&& vec) { + *this = std::move(vec); + } + + hyb_sd_vector& operator=(const hyb_sd_vector& vec){ + if (this != &vec) { + m_top = vec.m_top; + m_top_sel = vec.m_top_sel; + m_top_sel.set_vector(&m_top); + m_top_rank = vec.m_top_rank; + m_top_rank.set_vector(&m_top); + m_bottom = vec.m_bottom; + m_block_start = vec.m_block_start; + m_block_type = vec.m_block_type; + m_size = vec.m_size; + m_num_ones = vec.m_num_ones; + } + return *this; + } + + + hyb_sd_vector& operator=(hyb_sd_vector&& vec){ + if (this != &vec) { + m_top = std::move(vec.m_top); + m_top_sel = std::move(vec.m_top_sel); + m_top_sel.set_vector(&m_top); + m_top_rank = std::move(vec.m_top_rank); + m_top_rank.set_vector(&m_top); + m_bottom = std::move(vec.m_bottom); + m_block_start = std::move(vec.m_block_start); + m_block_type = std::move(vec.m_block_type); + m_size = vec.m_size; + m_num_ones = vec.m_num_ones; + } + return *this; + } + ///* explicit hyb_sd_vector(const bit_vector& bv) //: hyb_sd_vector(bv.ones_begin(),bv.ones_end(),bv.size()) { @@ -1268,18 +1302,6 @@ class hyb_sd_vector m_block_type.load(in); } - void swap(hyb_sd_vector& v) - { - std::swap(m_size, v.m_size); - std::swap(m_num_ones, v.m_num_ones); - m_top.swap(v.m_top); - util::swap_support(m_top_sel, v.m_top_sel, &m_top, &(v.m_top)); - util::swap_support(m_top_rank, v.m_top_rank, &m_top, &(v.m_top)); - m_bottom.swap(v.m_bottom); - m_block_start.swap(v.m_block_start); - m_block_type.swap(v.m_block_type); - } - iterator begin() const { return iterator(this, 0); @@ -1340,8 +1362,6 @@ class select_support_hyb_sd return *this; } - void swap(select_support_hyb_sd&) {} - void load(std::istream&, const hyb_bv_type* v = nullptr) { set_vector(v); @@ -1404,8 +1424,6 @@ class rank_support_hyb_sd return *this; } - void swap(rank_support_hyb_sd&) {} - void load(std::istream&, const hyb_bv_type* v = nullptr) { set_vector(v); diff --git a/include/sdsl/structure_tree.hpp b/include/sdsl/structure_tree.hpp index ecf3dc303..74c78bf9d 100644 --- a/include/sdsl/structure_tree.hpp +++ b/include/sdsl/structure_tree.hpp @@ -121,6 +121,7 @@ inline std::string create_html_header(const char* file_name) << " \n" << " \n" << " " << file_name << "\n" + << " " << " \n" << "