Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/apidoc/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -2074,6 +2074,7 @@ INCLUDE_FILE_PATTERNS =

PREDEFINED = __attribute__(x)= \
__declspec(x)= \
PARQUET_EXPORT= \
ARROW_EXPORT= \
ARROW_FLIGHT_EXPORT= \
ARROW_EXTERN_TEMPLATE= \
Expand Down
77 changes: 43 additions & 34 deletions cpp/src/arrow/csv/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,82 +32,91 @@ class DataType;

namespace csv {

// Silly workaround for https://github.com/michaeljones/breathe/issues/453
constexpr char kDefaultEscapeChar = '\\';

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Field delimiter
/// Field delimiter
char delimiter = ',';
// Whether quoting is used
/// Whether quoting is used
bool quoting = true;
// Quoting character (if `quoting` is true)
/// Quoting character (if `quoting` is true)
char quote_char = '"';
// Whether a quote inside a value is double-quoted
/// Whether a quote inside a value is double-quoted
bool double_quote = true;
// Whether escaping is used
/// Whether escaping is used
bool escaping = false;
// Escaping character (if `escaping` is true)
char escape_char = '\\';
// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
/// Escaping character (if `escaping` is true)
char escape_char = kDefaultEscapeChar;
/// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
bool newlines_in_values = false;
// Whether empty lines are ignored. If false, an empty line represents
// a single empty value (assuming a one-column CSV file).
/// Whether empty lines are ignored. If false, an empty line represents
/// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ConvertOptions {
// Conversion options

// Whether to check UTF8 validity of string columns
/// Whether to check UTF8 validity of string columns
bool check_utf8 = true;
// Optional per-column types (disabling type inference on those columns)
/// Optional per-column types (disabling type inference on those columns)
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
// Recognized spellings for null values
/// Recognized spellings for null values
std::vector<std::string> null_values;
// Recognized spellings for boolean values
/// Recognized spellings for boolean true values
std::vector<std::string> true_values;
/// Recognized spellings for boolean false values
std::vector<std::string> false_values;
// Whether string / binary columns can have null values.
// If true, then strings in "null_values" are considered null for string columns.
// If false, then all strings are valid string values.
/// Whether string / binary columns can have null values.
///
/// If true, then strings in "null_values" are considered null for string columns.
/// If false, then all strings are valid string values.
bool strings_can_be_null = false;

// XXX Should we have a separate FilterOptions?

// If non-empty, indicates the names of columns from the CSV file that should
// be actually read and converted (in the vector's order).
// Columns not in this vector will be ignored.
/// If non-empty, indicates the names of columns from the CSV file that should
/// be actually read and converted (in the vector's order).
/// Columns not in this vector will be ignored.
std::vector<std::string> include_columns;
// If false, columns in `include_columns` but not in the CSV file will error out.
// If true, columns in `include_columns` but not in the CSV file will produce
// a column of nulls (whose type is selected using `column_types`,
// or null by default)
// This option is ignored if `include_columns` is empty.
/// If false, columns in `include_columns` but not in the CSV file will error out.
/// If true, columns in `include_columns` but not in the CSV file will produce
/// a column of nulls (whose type is selected using `column_types`,
/// or null by default).
/// This option is ignored if `include_columns` is empty.
bool include_missing_columns = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

// Number of header rows to skip (not including the row of column names, if any)
/// Number of header rows to skip (not including the row of column names, if any)
int32_t skip_rows = 0;
// Column names for the target table.
// If empty, fall back on autogenerate_column_names.
/// Column names for the target table.
/// If empty, fall back on autogenerate_column_names.
std::vector<std::string> column_names;
// Whether to autogenerate column names if `column_names` is empty.
// If true, column names will be of the form "f0", "f1"...
// If false, column names will be read from the first CSV row after `skip_rows`.
/// Whether to autogenerate column names if `column_names` is empty.
/// If true, column names will be of the form "f0", "f1"...
/// If false, column names will be read from the first CSV row after `skip_rows`.
bool autogenerate_column_names = false;

/// Create read options with default values
static ReadOptions Defaults();
};

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/csv/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ class InputStream;

namespace csv {

/// A class that reads an entire CSV file into an Arrow Table
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire CSV file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&, const ConvertOptions&,
std::shared_ptr<TableReader>* out);
Expand Down
27 changes: 19 additions & 8 deletions cpp/src/arrow/json/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,33 +29,44 @@ class Schema;

namespace json {

enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType };
enum class UnexpectedFieldBehavior : char {
/// Unexpected JSON fields are ignored
Ignore,
/// Unexpected JSON fields error out
Error,
/// Unexpected JSON fields are type-inferred and included in the output
InferType
};

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Optional explicit schema (no type inference, ignores other fields)
/// Optional explicit schema (disables type inference on those fields)
std::shared_ptr<Schema> explicit_schema;

// Whether objects may be printed across multiple lines (for example pretty printed)
// NB: if false, input must end with an empty line
/// Whether objects may be printed across multiple lines (for example pretty-printed)
///
/// If true, parsing may be slower
/// If false, input must end with an empty line
bool newlines_in_values = false;

// How should parse handle fields outside the explicit_schema?
/// How JSON fields outside of explicit_schema (if given) are treated
UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

/// Create read options with default values
static ReadOptions Defaults();
};

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/json/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,17 @@ class InputStream;

namespace json {

/// A class that reads an entire JSON file into an Arrow Table
///
/// The file is expected to consist of individual line-separated JSON objects
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire JSON file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&,
std::shared_ptr<TableReader>* out);
Expand Down
60 changes: 39 additions & 21 deletions cpp/src/parquet/arrow/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,22 @@ class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;

// Arrow read adapter class for deserializing Parquet files as Arrow row
// batches.
//
// This interfaces caters for different use cases and thus provides different
// interfaces. In its most simplistic form, we cater for a user that wants to
// read the whole Parquet at once with the FileReader::ReadTable method.
//
// More advanced users that also want to implement parallelism on top of each
// single Parquet files should do this on the RowGroup level. For this, they can
// call FileReader::RowGroup(i)->ReadTable to receive only the specified
// RowGroup as a table.
//
// In the most advanced situation, where a consumer wants to independently read
// RowGroups in parallel and consume each column individually, they can call
// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
// instance.
//
/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters for different use cases and thus provides different
/// interfaces. In its most simplistic form, we cater for a user that wants to
/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
///
/// More advanced users that also want to implement parallelism on top of each
/// single Parquet file should do this on the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
/// instance.
///
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
Expand Down Expand Up @@ -106,11 +105,13 @@ class RowGroupReader;
// arrays
class PARQUET_EXPORT FileReader {
public:
/// Factory function to create a FileReader from a ParquetFileReader and properties
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* out);

/// Factory function to create a FileReader from a ParquetFileReader
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
std::unique_ptr<FileReader>* out);
Expand All @@ -127,7 +128,9 @@ class PARQUET_EXPORT FileReader {
/// \brief Return arrow schema for all the columns.
virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

// Read column as a whole into an Array.
/// \brief Read column as a whole into a chunked array.
///
/// The indicated column index is relative to the schema
virtual ::arrow::Status ReadColumn(int i,
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

Expand Down Expand Up @@ -183,11 +186,12 @@ class PARQUET_EXPORT FileReader {
return ::arrow::Status::OK();
}

// Read a table of columns into a Table
/// Read all columns into a Table
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

// Read a table of columns into a Table. Read only the indicated column
// indices (relative to the schema)
/// \brief Read the given columns into a Table
///
/// The indicated column indices are relative to the schema
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;

Expand All @@ -212,6 +216,7 @@ class PARQUET_EXPORT FileReader {
/// FileReader.
virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

/// \brief The number of row groups in the file
virtual int num_row_groups() const = 0;

virtual ParquetFileReader* parquet_reader() const = 0;
Expand Down Expand Up @@ -270,14 +275,18 @@ class PARQUET_EXPORT FileReaderBuilder {
public:
FileReaderBuilder();

/// Create FileReaderBuilder from Arrow file and optional properties / metadata
::arrow::Status Open(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
const ReaderProperties& properties = default_reader_properties(),
const std::shared_ptr<FileMetaData>& metadata = NULLPTR);

ParquetFileReader* raw_reader() { return raw_reader_.get(); }

/// Set Arrow MemoryPool for memory allocation
FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
/// Set Arrow reader properties
FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
/// Build FileReader instance
::arrow::Status Build(std::unique_ptr<FileReader>* out);

private:
Expand All @@ -286,6 +295,13 @@ class PARQUET_EXPORT FileReaderBuilder {
std::unique_ptr<ParquetFileReader> raw_reader_;
};

/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
::arrow::MemoryPool* allocator,
Expand All @@ -306,6 +322,8 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* reader);

/// @}

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
Expand Down
17 changes: 7 additions & 10 deletions cpp/src/parquet/arrow/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,10 @@ class ParquetFileWriter;

namespace arrow {

/**
* Iterative API:
* Start a new RowGroup/Chunk with NewRowGroup
* Write column-by-column the whole column chunk
*/
/// \brief Iterative FileWriter class
///
/// Start a new RowGroup or Chunk with NewRowGroup.
/// Write column-by-column the whole column chunk.
class PARQUET_EXPORT FileWriter {
public:
static ::arrow::Status Make(
Expand Down Expand Up @@ -99,11 +98,9 @@ PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);

/**
* Write a Table to Parquet.
*
* The table shall only consist of columns of primitive type or of primitive lists.
*/
/// \brief Write a Table to Parquet.
///
/// The table shall only consist of columns of primitive type or of primitive lists.
::arrow::Status PARQUET_EXPORT WriteTable(
const ::arrow::Table& table, MemoryPool* pool,
const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size,
Expand Down
1 change: 1 addition & 0 deletions docs/source/cpp/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ API Reference
api/tensor
api/utilities
api/io
api/formats
api/cuda
api/flight
api/filesystem
Loading