-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-7759: [C++][Dataset] Add CsvFileFormat #7033
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
598558e
b843048
920c269
fb443d3
3ce9e15
f69f1fb
2052925
5179b5b
447b1d7
02593f3
58a1059
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "arrow/dataset/file_csv.h" | ||
|
|
||
| #include <algorithm> | ||
| #include <memory> | ||
| #include <string> | ||
| #include <utility> | ||
|
|
||
| #include "arrow/csv/options.h" | ||
| #include "arrow/csv/reader.h" | ||
| #include "arrow/dataset/dataset_internal.h" | ||
| #include "arrow/dataset/file_base.h" | ||
| #include "arrow/dataset/filter.h" | ||
| #include "arrow/dataset/type_fwd.h" | ||
| #include "arrow/dataset/visibility.h" | ||
| #include "arrow/result.h" | ||
| #include "arrow/type.h" | ||
| #include "arrow/util/iterator.h" | ||
|
|
||
| namespace arrow { | ||
| namespace dataset { | ||
|
|
||
| using internal::checked_cast; | ||
| using internal::checked_pointer_cast; | ||
|
|
||
| /// \brief Assemble the csv::ConvertOptions to use for a scan. | ||
| /// | ||
| /// When scan_options is null the defaults are returned untouched. Otherwise every | ||
| /// field of the scan schema is requested with its declared type, followed by each | ||
| /// field referenced by the filter which is not found in that schema. | ||
| static inline Result<csv::ConvertOptions> GetConvertOptions( | ||
| const CsvFileFormat& format, const std::shared_ptr<ScanOptions>& scan_options) { | ||
| auto convert_options = csv::ConvertOptions::Defaults(); | ||
| if (scan_options == nullptr) { | ||
| return convert_options; | ||
| } | ||
|
|
||
| // Enabled so that this format behaves like the others: a missing column | ||
| // will be materialized as null. | ||
| convert_options.include_missing_columns = true; | ||
|
|
||
| const auto& scan_schema = scan_options->schema(); | ||
| for (const auto& field : scan_schema->fields()) { | ||
| const auto& column = field->name(); | ||
| convert_options.column_types[column] = field->type(); | ||
| convert_options.include_columns.push_back(column); | ||
| } | ||
|
|
||
| // FIXME(bkietz) also acquire types of fields materialized but not projected. | ||
| for (auto&& filter_column : FieldsInExpression(scan_options->filter)) { | ||
| ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(filter_column).FindOneOrNone(*scan_schema)); | ||
| const bool in_scan_schema = !match.indices().empty(); | ||
| if (in_scan_schema) { | ||
| continue; | ||
| } | ||
| // Referenced only by the filter: still has to be read from the file. | ||
| convert_options.include_columns.push_back(std::move(filter_column)); | ||
| } | ||
| return convert_options; | ||
| } | ||
|
|
||
| /// \brief Produce the csv::ReadOptions used to open a CSV file for scanning. | ||
| static inline csv::ReadOptions GetReadOptions(const CsvFileFormat& format) { | ||
| csv::ReadOptions read_options = csv::ReadOptions::Defaults(); | ||
| // ScanTasks may themselves run on multiple threads; also converting each | ||
| // individual file with multiple threads would lead to excessive thread | ||
| // contention, so per-file threading is switched off. | ||
| read_options.use_threads = false; | ||
| return read_options; | ||
| } | ||
|
|
||
| /// \brief Open `source` and create a csv::StreamingReader over it. | ||
| /// | ||
| /// Read options come from GetReadOptions(format), parse options from | ||
| /// format.parse_options and convert options from GetConvertOptions(format, options). | ||
| /// | ||
| /// \param source the file source to open | ||
| /// \param format the format providing the parse options | ||
| /// \param options scan options forwarded to GetConvertOptions; may be null | ||
| /// \param pool memory pool handed to csv::StreamingReader::Make | ||
| /// \return the reader, or an error if the source cannot be opened, the convert | ||
| /// options cannot be built, or the reader cannot be created | ||
| static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader( | ||
| const FileSource& source, const CsvFileFormat& format, | ||
| const std::shared_ptr<ScanOptions>& options = nullptr, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd take a ConvertOption instead of a ScanTask.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you mean |
||
| MemoryPool* pool = default_memory_pool()) { | ||
| ARROW_ASSIGN_OR_RAISE(auto input, source.Open()); | ||
|
|
||
| auto reader_options = GetReadOptions(format); | ||
| const auto& parse_options = format.parse_options; | ||
| ARROW_ASSIGN_OR_RAISE(auto convert_options, GetConvertOptions(format, options)); | ||
| auto maybe_reader = csv::StreamingReader::Make(pool, std::move(input), reader_options, | ||
| parse_options, convert_options); | ||
| // On failure, rewrite the message so that it names the path of the source | ||
| // which could not be opened, followed by the original status. | ||
| if (!maybe_reader.ok()) { | ||
| return maybe_reader.status().WithMessage("Could not open CSV input source '", | ||
| source.path(), "': ", maybe_reader.status()); | ||
| } | ||
|
|
||
| return std::move(maybe_reader).ValueOrDie(); | ||
| } | ||
|
|
||
| /// \brief A ScanTask which reads the record batches of a single CSV source. | ||
| class CsvScanTask : public ScanTask { | ||
| public: | ||
| /// \brief Store the format and source; options and context go to ScanTask. | ||
| CsvScanTask(std::shared_ptr<const CsvFileFormat> format, FileSource source, | ||
| std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context) | ||
| : ScanTask(std::move(options), std::move(context)), | ||
| csv_format_(std::move(format)), | ||
| file_source_(std::move(source)) {} | ||
|
|
||
| /// \brief Open a reader on the source and expose it as a RecordBatchIterator. | ||
| Result<RecordBatchIterator> Execute() override { | ||
| const auto& scan_options = options(); | ||
| MemoryPool* memory_pool = context()->pool; | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| auto csv_reader, | ||
| OpenReader(file_source_, *csv_format_, scan_options, memory_pool)); | ||
| return IteratorFromReader(std::move(csv_reader)); | ||
| } | ||
|
|
||
| private: | ||
| std::shared_ptr<const CsvFileFormat> csv_format_; | ||
| FileSource file_source_; | ||
| }; | ||
|
|
||
| /// \brief Report whether a CSV reader can be created for `source`. | ||
| /// | ||
| /// A failure to open the source itself is returned as an error. Any failure of | ||
| /// OpenReader after that is reported as `false` rather than as an error. | ||
| Result<bool> CsvFileFormat::IsSupported(const FileSource& source) const { | ||
| // Open the source once up front purely to surface open errors to the caller. | ||
| RETURN_NOT_OK(source.Open().status()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't you let OpenReader fail?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If source fails to open (for example if it points to a file which doesn't exist) that should raise an error rather than returning false, which is what this line detects. |
||
| return OpenReader(source, *this).ok(); | ||
| } | ||
|
|
||
| /// \brief Open a reader on `source` with default scan options and return the | ||
| /// schema that reader reports. | ||
| Result<std::shared_ptr<Schema>> CsvFileFormat::Inspect(const FileSource& source) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto csv_reader, OpenReader(source, *this)); | ||
| std::shared_ptr<Schema> file_schema = csv_reader->schema(); | ||
| return file_schema; | ||
| } | ||
|
|
||
| /// \brief Create the scan tasks for `source`: exactly one CsvScanTask, which | ||
| /// shares ownership of this format. | ||
| Result<ScanTaskIterator> CsvFileFormat::ScanFile( | ||
| const FileSource& source, std::shared_ptr<ScanOptions> options, | ||
| std::shared_ptr<ScanContext> context) const { | ||
| std::shared_ptr<ScanTask> csv_task = std::make_shared<CsvScanTask>( | ||
| checked_pointer_cast<const CsvFileFormat>(shared_from_this()), source, | ||
| std::move(options), std::move(context)); | ||
|
|
||
| return MakeVectorIterator<std::shared_ptr<ScanTask>>({std::move(csv_task)}); | ||
| } | ||
|
|
||
| } // namespace dataset | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/csv/options.h" | ||
| #include "arrow/dataset/file_base.h" | ||
| #include "arrow/dataset/type_fwd.h" | ||
| #include "arrow/dataset/visibility.h" | ||
| #include "arrow/result.h" | ||
|
|
||
| namespace arrow { | ||
| namespace dataset { | ||
|
|
||
| /// \brief A FileFormat implementation that reads from CSV files | ||
| /// | ||
| /// Only read-side operations (IsSupported, Inspect, ScanFile) are declared here. | ||
| class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { | ||
| public: | ||
| /// Options affecting the parsing of CSV files | ||
| csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); | ||
bkietz marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| /// \brief The name identifying this format: "csv". | ||
| std::string type_name() const override { return "csv"; } | ||
|
|
||
| /// \brief Return whether `source` can be read with this format. | ||
| Result<bool> IsSupported(const FileSource& source) const override; | ||
|
|
||
| /// \brief Return the schema of the file if possible. | ||
| Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override; | ||
|
|
||
| /// \brief Open a file for scanning | ||
| Result<ScanTaskIterator> ScanFile(const FileSource& source, | ||
| std::shared_ptr<ScanOptions> options, | ||
| std::shared_ptr<ScanContext> context) const override; | ||
| }; | ||
|
|
||
| } // namespace dataset | ||
| } // namespace arrow | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would expect the FilterAndProjectScanTask to fix this in a more efficient way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just to stop conversion from erroring if a column is projected but absent from the file. Another way to handle this would be: restrict include_columns to columns present in the file then let the projector handle the rest as you say, but that would require knowledge of the file's columns which we don't have at this stage.