Skip to content

Commit 893f1fd

Browse files
committed
gaussian cubes infra
1 parent 990a924 commit 893f1fd

11 files changed

+405
-152
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
*~
22
.DS_Store
3-
3+
CMakeCache.txt
4+
CMakeFiles
45
build
56
local_data
67
bin

CMakeLists.txt

+4-17
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@ if(Boost_FOUND)
1212
include_directories(${Boost_INCLUDE_DIRS})
1313
endif()
1414

15-
1615
# include(cmake/FindArrow.cmake) findarrow doesn't work, so we'll do it by hand
16+
# let me say this with all my heart: fuck cmake
1717

18-
set (ARROW_INCLUDE_DIR /Users/cscheid/repos/github/apache/arrow/cpp/src)
18+
set (ARROW_BASE_PATH /Users/cscheid/repos/github/apache/arrow/cpp)
19+
set (ARROW_INCLUDE_DIR ${ARROW_BASE_PATH}/src)
1920
# set (ARROW_LIBRARY_DIR /Users/cscheid/repos/github/apache/arrow/cpp/release/release)
20-
set (ARROW_LIBRARY_DIR /Users/cscheid/repos/github/apache/arrow/cpp/debug/debug)
21-
22-
# let me say this will all my heart: fuck cmake
21+
set (ARROW_LIBRARY_DIR ${ARROW_BASE_PATH}/release/release)
2322

2423
################################################################################
2524

@@ -97,15 +96,3 @@ target_include_directories(test_ztf_arrow PUBLIC ${ARROW_INCLUDE_DIR})
9796
target_link_directories(test_ztf_arrow PUBLIC ${ARROW_LIBRARY_DIR})
9897

9998
target_link_libraries(test_ztf_arrow PUBLIC arrow)
100-
101-
################################################################################
102-
103-
add_executable(test_arrow_write
104-
src/test_arrow_write
105-
)
106-
107-
target_include_directories(test_arrow_write PUBLIC ${ARROW_INCLUDE_DIR})
108-
109-
target_link_directories(test_arrow_write PUBLIC ${ARROW_LIBRARY_DIR})
110-
111-
target_link_libraries(test_arrow_write PUBLIC arrow)

client/js/agg.arrow

-7.41 MB
Binary file not shown.

client/js/index.js

+39-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,40 @@
11
import init, * as arrow from "./pkg/arrow_wasm.js";
22

3+
/**
 * Convert an Arrow vector into a typed array, picking the typed accessor on
 * `vec` from the JSON description of the column type.
 *
 * Handled types: double and single precision floats, and signed/unsigned
 * 64-bit and 32-bit integers. Anything else throws.
 *
 * @param vec  vector object exposing the `asXxxVector()` accessors
 * @param type JSON type descriptor (`name`, plus `precision` or
 *             `bitWidth`/`isSigned`)
 * @returns whatever the selected accessor's `toArray()` returns
 * @throws {Error} when the type is not one of the handled combinations
 */
function vectorToArray(vec, type)
{
  // Name of the accessor method to invoke; stays null for unsupported types.
  let accessor = null;

  switch (type.name) {
  case 'floatingpoint':
    if (type.precision === 'DOUBLE') {
      accessor = 'asFloat64Vector';
    } else if (type.precision === 'SINGLE') {
      accessor = 'asFloat32Vector';
    }
    break;
  case 'int':
    if (type.bitWidth === 64) {
      accessor = type.isSigned ? 'asInt64Vector' : 'asUint64Vector';
    } else if (type.bitWidth === 32) {
      accessor = type.isSigned ? 'asInt32Vector' : 'asUint32Vector';
    }
    break;
  default:
    break;
  }

  if (accessor === null) {
    throw new Error(`Unrecognized type: ${JSON.stringify(type)}`);
  }
  // Call through `vec[...]` so the accessor keeps `vec` as its receiver.
  return vec[accessor]().toArray();
}
21+
22+
/**
 * Turn an Arrow table into a plain object keyed by column name.
 *
 * Each value is a list with one entry per record batch of the table; every
 * entry is the result of vectorToArray() on that batch's column.
 *
 * @param table table exposing `schema`, `numBatches` and `recordBatch(i)`
 * @returns object mapping column name -> array of per-batch arrays
 */
function tableToDictOfArrays(table)
{
  const fields = table.schema.toJSON().fields;
  const columns = {};

  fields.forEach((field, col) => {
    const perBatch = [];
    let batch = 0;
    while (batch < table.numBatches) {
      const vec = table.recordBatch(batch).column(col);
      perBatch.push(vectorToArray(vec, fields[col].type));
      ++batch;
    }
    columns[field.name] = perBatch;
  });

  return columns;
}
37+
338
async function run() {
439
await init();
540

@@ -9,8 +44,11 @@ async function run() {
944
const data = await response.arrayBuffer();
1045
const contents = new Uint8Array(data);
1146

12-
const table = arrow.Table.from(contents);
47+
let table = arrow.Table.from(contents);
1348
console.log(table.schema.toJSON());
49+
table = tableToDictOfArrays(table);
50+
51+
debugger;
1452
}
1553

1654
run();

src/arrow_utils.cc

+65
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#include <iostream>
88
#include <algorithm>
9+
#include <sstream>
10+
#include <memory>
911

1012
#include "arrow_convenience.h"
1113

@@ -42,6 +44,21 @@ shared_ptr<Table> make_table(
4244
return arrow::Table::Make(arrow::schema(fields), arrays);
4345
}
4446

47+
// Write `table` to the file at `path` through Arrow's IPC file writer.
//
// `max_chunk_size` is forwarded unchanged to RecordBatchWriter::WriteTable.
// Every failure is fatal: errors go through ValueOrDie() / OK_OR_DIE instead
// of being returned to the caller.
void write_arrow(
    shared_ptr<Table> table,
    const string &path,
    int64_t max_chunk_size)
{
  shared_ptr<io::FileOutputStream>
      file = io::FileOutputStream::Open(path).ValueOrDie();

  shared_ptr<ipc::RecordBatchWriter>
      writer = ipc::MakeFileWriter(file, table->schema()).ValueOrDie();

  OK_OR_DIE(writer->WriteTable(*table, max_chunk_size));
  OK_OR_DIE(writer->Close());

  // Close the stream explicitly so that a failed flush/close is reported
  // here rather than being left to the stream's destructor.
  OK_OR_DIE(file->Close());
}
61+
4562
/******************************************************************************/
4663

4764
shared_ptr<Table> read_feather_table(const std::string& path)
@@ -248,3 +265,51 @@ sort_table(std::shared_ptr<Table> input,
248265
"sort_indices", { Datum(input) }, options).ValueOrDie().make_array();
249266
return permute_table(input, sort_permutation);
250267
}
268+
269+
/******************************************************************************/
270+
271+
// Build the table of per-row products used as sufficient statistics for
// fitting gaussians over `cols`.
//
// Each row is treated as the vector (1, c_0, ..., c_{n-1}); for every pair
// of positions (a, b) with a <= b the product of the two entries is appended
// to its own output column. That gives (n+1)(n+2)/2 columns, named "s0",
// "s1", ... in the order the pairs are visited. Column values are read as
// DoubleType.
shared_ptr<Table>
make_gaussian_stats_table(
    const vector<shared_ptr<ChunkedArray>> &cols)
{
  const int n_cols = (int) cols.size();
  const size_t n_stats = (cols.size() + 1) * (cols.size() + 2) / 2;
  vector<NumericBuilder<DoubleType>> builders(n_stats);

  // Index -1 stands for the implicit constant-1 entry of the row.
  auto append_products = [&builders, &n_cols](RowIterator &row) {
    size_t slot = 0;
    for (int a = -1; a < n_cols; ++a) {
      const double lhs = (a < 0) ? 1.0 : row.cols_[a].value<DoubleType>();
      for (int b = a; b < n_cols; ++b) {
        const double rhs = (b < 0) ? 1.0 : row.cols_[b].value<DoubleType>();
        OK_OR_DIE(builders[slot].Append(lhs * rhs));
        ++slot;
      }
    }
  };
  RowIterator(cols).for_each(append_products);

  unordered_map<string, shared_ptr<ChunkedArray>> stats;
  for (size_t k = 0; k < builders.size(); ++k) {
    stringstream label;
    label << "s" << k;
    stats[label.str()] = shared_ptr<ChunkedArray>(new ChunkedArray({ builders[k].Finish().ValueOrDie() }));
  }
  return make_table(stats);
}
297+
298+
// Build the gaussian statistics table from the columns of `t` named in
// `col_names`, in the order given.
//
// A name that matches no column of `t` is a fatal error, reported through
// OK_OR_DIE like the other failures in this file.
shared_ptr<Table>
make_gaussian_stats_table(
    shared_ptr<Table> t,
    const vector<string> &col_names)
{
  vector<shared_ptr<ChunkedArray>> cols;
  cols.reserve(col_names.size());
  for (auto &name: col_names) {
    shared_ptr<ChunkedArray> col = t->GetColumnByName(name);
    // GetColumnByName yields a null pointer for an unknown name; stop here
    // with a readable message instead of passing the null column along.
    if (!col) {
      OK_OR_DIE(arrow::Status::Invalid(
          "make_gaussian_stats_table: no column named '", name, "'"));
    }
    cols.push_back(col);
  }
  return make_gaussian_stats_table(cols);
}
310+
311+
// Convenience overload: compute the statistics over every column of `t`.
shared_ptr<Table>
make_gaussian_stats_table(shared_ptr<Table> t)
{
  const auto &every_column = t->columns();
  return make_gaussian_stats_table(every_column);
}

src/arrow_utils.h

+45
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
#pragma once
22

33
#include <arrow/compute/api_vector.h>
4+
#include <arrow/io/file.h>
5+
#include <arrow/io/api.h>
46
#include <arrow/builder.h>
57
#include <arrow/array.h>
68
#include <arrow/table.h>
9+
#include <arrow/ipc/writer.h>
10+
711
#include <memory>
812
#include <unordered_map>
913

@@ -29,6 +33,11 @@ void arrow_foreach(
2933
std::shared_ptr<arrow::ChunkedArray> column_2,
3034
T closure);
3135

36+
/// write_arrow: writes `table` to the file at `path`.
///
/// `max_chunk_size` is handed to the table writer as-is.
/// NOTE(review): the default of -1 presumably means "no chunk size limit";
/// confirm against Arrow's RecordBatchWriter::WriteTable documentation.
void write_arrow(
37+
std::shared_ptr<arrow::Table> table,
38+
const std::string &path,
39+
int64_t max_chunk_size = -1);
40+
3241
/******************************************************************************/
3342
/// cbind: combines columns from two different tables.
3443
///
@@ -255,6 +264,19 @@ struct ChunkedArrayIterator
255264
// Do not reuse RowIterators!
256265
struct RowIterator
257266
{
267+
// Build a row iterator over the columns of `table` listed in `names`, in
// the order given.
//
// When `skip_null` is true, ensure_not_null() is called once all columns
// have been attached. A name that matches no column aborts with a message.
explicit RowIterator(std::shared_ptr<arrow::Table> table,
                     const std::vector<std::string> &names,
                     bool skip_null=true)
{
  for (auto &name: names) {
    std::shared_ptr<arrow::ChunkedArray> column = table->GetColumnByName(name);
    // GetColumnByName yields a null pointer for an unknown name; fail here
    // rather than wrapping a null column in an iterator.
    if (!column) {
      arrow::Status::Invalid("RowIterator: no column named '", name, "'").Abort();
    }
    cols_.push_back(ChunkedArrayIterator(column));
  }
  if (skip_null) {
    ensure_not_null();
  }
}
279+
258280
explicit RowIterator(const std::vector<ChunkedArrayIterator> &cols,
259281
bool skip_null=true)
260282
: cols_(cols) {
@@ -364,6 +386,29 @@ template <typename ArrowType, typename T>
364386
std::shared_ptr<arrow::ChunkedArray>
365387
map_rows(RowIterator &rows, T closure);
366388

389+
// same as above, but construct the RowIterator internally
390+
// given a table and some columns
391+
template <typename ArrowType, typename T>
392+
std::shared_ptr<arrow::ChunkedArray>
393+
map_rows(std::shared_ptr<arrow::Table> t,
394+
const std::vector<std::string> &col_names,
395+
T closure);
396+
397+
/******************************************************************************/
398+
// create the table of sufficient statistics to fit gaussians.
399+
// the result has (|cols|+1)(|cols|+2)/2 columns: one per product of a pair of entries taken from (1, cols...)
400+
//
401+
// the selected columns must all be DoubleType
402+
403+
std::shared_ptr<arrow::Table>
404+
make_gaussian_stats_table(
405+
std::shared_ptr<arrow::Table> t,
406+
const std::vector<std::string> &cols);
407+
408+
// This assumes all columns are to be selected.
409+
std::shared_ptr<arrow::Table>
410+
make_gaussian_stats_table(std::shared_ptr<arrow::Table> t);
411+
367412
/******************************************************************************/
368413

369414
#include "arrow_utils.hh"

src/arrow_utils.hh

+17
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,20 @@ map_rows(RowIterator &rows, T closure)
119119
std::cerr << "Done!" << std::endl;
120120
return std::shared_ptr<arrow::ChunkedArray>(new arrow::ChunkedArray({ builder.Finish().ValueOrDie() }));
121121
}
122+
123+
// same as above, but construct the RowIterator internally
124+
// given a table and some columns
125+
template <typename ArrowType, typename T>
126+
std::shared_ptr<arrow::ChunkedArray>
127+
map_rows(std::shared_ptr<arrow::Table> t,
128+
const std::vector<std::string> &col_names,
129+
T closure)
130+
{
131+
std::vector<std::shared_ptr<arrow::ChunkedArray>> cols;
132+
for (auto &name: col_names) {
133+
cols.push_back(t->GetColumnByName(name));
134+
}
135+
RowIterator rows(cols);
136+
return map_rows<ArrowType, T>(rows, closure);
137+
}
138+

0 commit comments

Comments
 (0)