Skip to content

Commit

Permalink
Add docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
weiliw-amz committed Jul 20, 2023
1 parent 125894e commit 8d1eb86
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 36 deletions.
7 changes: 6 additions & 1 deletion pecos/core/third_party/ankerl/unordered_dense.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.


// NOTE: This copy of Ankerl's map is modified to support memory-mapping and cannot be used in the original way.
// See utils/mmap_hashmap.hpp for usage.


#ifndef ANKERL_UNORDERED_DENSE_H
#define ANKERL_UNORDERED_DENSE_H

Expand Down Expand Up @@ -88,7 +93,7 @@
# include <type_traits> // for enable_if_t, declval, conditional_t, ena...
# include <utility> // for forward, exchange, pair, as_const, piece...
# include <vector> // for vector
# include "../../utils/mmap_util.hpp" // MODIFIED for mmap
# include "utils/mmap_util.hpp" // MODIFIED for mmap
# if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() == 0
# include <cstdlib> // for abort
# endif
Expand Down
10 changes: 8 additions & 2 deletions pecos/core/utils/mmap_hashmap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@
#ifndef __MMAP_ANKERL_HASHMAP_H__
#define __MMAP_ANKERL_HASHMAP_H__

#include "../third_party/ankerl/unordered_dense.h"
#include "third_party/ankerl/unordered_dense.h"
#include "mmap_util.hpp"

namespace pecos {
namespace mmap_hashmap {

namespace details_ { // namespace for Module Private classes

/* For all memory-mappable vectors, the write functions assume that the
underlying storage is in memory; otherwise the code will fail. */

// Memory-mappable vector of std::pair<StrView, uint64_t> for Ankerl
// This vector takes/returns std::string_view as the key, but emplaces it back in the special mmap format StrView
class AnkerlStr2IntMmapableVector {
Expand Down Expand Up @@ -71,6 +74,7 @@ class AnkerlStr2IntMmapableVector {
constexpr auto end() const -> const_iterator { return {data_ + size_}; }
constexpr auto cend() const -> const_iterator{ return {data_ + size_}; }

// ----- Write funcs start -----
void shrink_to_fit() { store_.shrink_to_fit(); }
void reserve(size_t new_capacity) { store_.reserve(new_capacity); }

Expand Down Expand Up @@ -101,6 +105,7 @@ class AnkerlStr2IntMmapableVector {
// Deletion is not supported: this always throws std::runtime_error.
void pop_back() {
throw std::runtime_error("Not implemented for deletion");
}
// ----- Write funcs end -----

size_type size() const { return size_; }

Expand Down Expand Up @@ -222,7 +227,6 @@ class AnkerlStr2IntMmapableVector {


// Memory-mappable vector of std::pair<uint64_t, uint64_t> for Ankerl
// When calling write methods, the assumption is that the underlying storage is in memory, i.e. std::vector
class AnkerlInt2IntMmapableVector : public pecos::mmap_util::MmapableVector<std::pair<uint64_t, uint64_t>> {
template <bool IsConst>
class iter_t;
Expand Down Expand Up @@ -255,6 +259,7 @@ class AnkerlInt2IntMmapableVector : public pecos::mmap_util::MmapableVector<std:
constexpr auto end() const -> const_iterator { return {this->data_ + this->size_}; }
constexpr auto cend() const -> const_iterator{ return {this->data_ + this->size_}; }

// ----- Write funcs start -----
void shrink_to_fit() { this->store_.shrink_to_fit(); }
void reserve(size_t new_capacity) { this->store_.reserve(new_capacity); }

Expand All @@ -271,6 +276,7 @@ class AnkerlInt2IntMmapableVector : public pecos::mmap_util::MmapableVector<std:
this->size_ = this->store_.size();
this->data_ = this->store_.data();
}
// ----- Write funcs end -----

/* Get key for member */
key_type get_key(value_type const& vt) const {
Expand Down
100 changes: 67 additions & 33 deletions pecos/utils/mmap_hashmap_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,57 @@
# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
# and limitations under the License.
import logging
from abc import abstractmethod
from ctypes import (
c_bool,
c_uint32,
c_uint64,
c_char_p,
c_void_p,
)
from pecos.core import clib
from typing import Optional


LOGGER = logging.getLogger(__name__)


class MmapHashmap(object):
"""
Python wrapper of Memory-mappable Hashmap
Python wrapper of Memory-mappable Hashmap, which is similar to Python Dict,
but provides memory-map save/load functionalities, so that user could write and dump to disk,
and later load read-only with memory-map techniques which allows out-of-core
access for large data that cannot fit into memory.
However, Memory-mappable Hashmap is not identical to Python Dict.
The major differences are:
* Only read-only and write-only modes are provided.
* In write-only mode, deleting key is not allowed. Can only do following operations:
* Insert key
* Overwrite value for an existing key
"""

def __init__(self, map_type):
def __init__(self, map_type: str):
if map_type not in clib.mmap_map_fn_dict:
raise NotImplementedError(f"map_type={map_type} is not implemented.")

self.map_type = map_type
self.map = None
self.mode = None
self.map_dir = None
self.mode: Optional[str] = None
self.map_dir: Optional[str] = None

def open(self, mode: str, map_dir: str):
"""
Open hashmap at given directory for read-only or write-only.
For write-only, the hashmap exists in RAM until dumps to given directory when closing.
def open(self, mode, map_dir):
args:
mode: Open mode, in "w", "r", "r_lazy"(not pre-load).
map_dir: Directory to load from/save to.
"""
if mode == "w":
map = _MmapHashmapWrite.init(self.map_type, map_dir)
LOGGER.info(f"Opened hashmap for writing. Will save to {map_dir} upon closing.")
elif mode == "r" or mode == "r_lazy":
lazy_load = True if mode == "r_lazy" else False
map = _MmapHashmapReadOnly.init(self.map_type, map_dir, lazy_load)
LOGGER.info(
f"Opened hashmap for read-only. Will {'NOT' if lazy_load else ''} pre-load."
)
else:
raise NotImplementedError(f"{mode} not implemented.")

Expand All @@ -47,12 +67,30 @@ def open(self, mode, map_dir):
self.map_dir = map_dir

def close(self):
"""
Close and destruct hashmap.
For write-only, dumps to given directory.
"""
if self.mode == "w":
self.map.save()
LOGGER.info(f"Saved hashmap to {self.map_dir} upon closing.")
self.map.destruct()
LOGGER.info("Destructed hashmap upon closing.")
self.map = None
self.mode = None
self.map_dir = None

def __del__(self):
"""
Destructor to call close() if not called previously
"""
if self.map is not None:
self.close()


class _MmapHashmapBase(object):
"""Base class for methods shared by all modes"""

def __init__(self, map_ptr, fn_dict):
self.map_ptr = map_ptr
self.fn_dict = fn_dict
Expand All @@ -65,6 +103,8 @@ def destruct(self):


class _MmapHashmapReadOnly(_MmapHashmapBase):
"""Base class for methods shared by all read modes"""

@abstractmethod
def get(self, key, default_val):
pass
Expand All @@ -80,7 +120,7 @@ def __contains__(self, key):
@classmethod
def init(cls, map_type, map_dir, lazy_load):
fn_dict = clib.mmap_hashmap_init(map_type)
map_ptr = fn_dict["load"](c_char_p(map_dir.encode("utf-8")), c_bool(lazy_load))
map_ptr = fn_dict["load"](map_dir.encode("utf-8"), lazy_load)

if map_type == "str2int":
return _MmapHashmapStr2IntReadOnly(map_ptr, fn_dict)
Expand All @@ -97,37 +137,33 @@ def get(self, key_utf8, default_val):
key_utf8: UTF8 encoded bytes string key
"""
return self.fn_dict["get_w_default"](
c_void_p(self.map_ptr),
c_char_p(key_utf8),
c_uint32(len(key_utf8)),
c_uint64(default_val),
self.map_ptr,
key_utf8,
len(key_utf8),
default_val,
)

def __getitem__(self, key_utf8):
return self.fn_dict["get"](
c_void_p(self.map_ptr), c_char_p(key_utf8), c_uint32(len(key_utf8))
)
return self.fn_dict["get"](self.map_ptr, key_utf8, len(key_utf8))

def __contains__(self, key_utf8):
return self.fn_dict["contains"](
c_void_p(self.map_ptr), c_char_p(key_utf8), c_uint32(len(key_utf8))
)
return self.fn_dict["contains"](self.map_ptr, key_utf8, len(key_utf8))


class _MmapHashmapInt2IntReadOnly(_MmapHashmapReadOnly):
def get(self, key, default_val):
return self.fn_dict["get_w_default"](
c_void_p(self.map_ptr), c_uint64(key), c_uint64(default_val)
)
return self.fn_dict["get_w_default"](self.map_ptr, key, default_val)

def __getitem__(self, key):
return self.fn_dict["get"](c_void_p(self.map_ptr), c_uint64(key))
return self.fn_dict["get"](self.map_ptr, key)

def __contains__(self, key):
return self.fn_dict["contains"](c_void_p(self.map_ptr), c_uint64(key))
return self.fn_dict["contains"](self.map_ptr, key)


class _MmapHashmapWrite(_MmapHashmapBase):
"""Base class for methods shared by all write modes"""

def __init__(self, map_ptr, fn_dict, map_dir):
super().__init__(map_ptr, fn_dict)
self.map_dir = map_dir
Expand All @@ -140,7 +176,7 @@ def save(self):
import pathlib

pathlib.Path(self.map_dir).mkdir(parents=True, exist_ok=True)
self.fn_dict["save"](self.map_ptr, c_char_p(self.map_dir.encode("utf-8")))
self.fn_dict["save"](self.map_ptr, (self.map_dir.encode("utf-8")))

@classmethod
def init(cls, map_type, map_dir):
Expand All @@ -162,11 +198,9 @@ def insert(self, key_utf8, val):
key_utf8 (bytes): UTF8 encoded bytes string key
val (int): Integer value
"""
self.fn_dict["insert"](
c_void_p(self.map_ptr), c_char_p(key_utf8), c_uint32(len(key_utf8)), c_uint64(val)
)
self.fn_dict["insert"](self.map_ptr, key_utf8, len(key_utf8), val)


class _MmapHashmapInt2IntWrite(_MmapHashmapWrite):
def insert(self, key, val):
self.fn_dict["insert"](c_void_p(self.map_ptr), c_uint64(key), c_uint64(val))
self.fn_dict["insert"](self.map_ptr, key, val)

0 comments on commit 8d1eb86

Please sign in to comment.