diff --git a/src/bufr/BufrParser/BufrParser.cpp b/src/bufr/BufrParser/BufrParser.cpp index 4686012ae..00160e0d7 100644 --- a/src/bufr/BufrParser/BufrParser.cpp +++ b/src/bufr/BufrParser/BufrParser.cpp @@ -48,7 +48,8 @@ namespace Ingester { { auto startTime = std::chrono::steady_clock::now(); - auto querySet = bufr::QuerySet(); + auto querySet = bufr::QuerySet(description_.getExport().getSubsets()); + for (const auto &var : description_.getExport().getVariables()) { for (const auto &queryPair : var->getQueryList()) diff --git a/src/bufr/BufrParser/Exports/Export.cpp b/src/bufr/BufrParser/Exports/Export.cpp index b421af3ad..7c3fafee8 100644 --- a/src/bufr/BufrParser/Exports/Export.cpp +++ b/src/bufr/BufrParser/Exports/Export.cpp @@ -28,6 +28,7 @@ namespace const char* Splits = "splits"; const char* Variables = "variables"; const char* GroupByVariable = "group_by_variable"; + const char* Subsets = "subsets"; namespace Variable { @@ -75,6 +76,11 @@ namespace Ingester groupByVariable = conf.getString(ConfKeys::GroupByVariable); } + if (conf.has(ConfKeys::Subsets)) + { + subsets_ = conf.getStringVector(ConfKeys::Subsets); + } + if (conf.has(ConfKeys::Variables)) { addVariables(conf.getSubConfiguration(ConfKeys::Variables), diff --git a/src/bufr/BufrParser/Exports/Export.h b/src/bufr/BufrParser/Exports/Export.h index 51a574ac7..baed395d5 100644 --- a/src/bufr/BufrParser/Exports/Export.h +++ b/src/bufr/BufrParser/Exports/Export.h @@ -35,11 +35,13 @@ namespace Ingester inline Splits getSplits() const { return splits_; } inline Variables getVariables() const { return variables_; } inline Filters getFilters() const { return filters_; } + inline std::vector getSubsets() const { return subsets_; } private: Splits splits_; Variables variables_; Filters filters_; + std::vector subsets_; /// \brief Create Variables exports from config. 
void addVariables(const eckit::Configuration &conf, diff --git a/src/bufr/BufrParser/Exports/Splits/CategorySplit.cpp b/src/bufr/BufrParser/Exports/Splits/CategorySplit.cpp index 77c51fba4..8d30aa319 100644 --- a/src/bufr/BufrParser/Exports/Splits/CategorySplit.cpp +++ b/src/bufr/BufrParser/Exports/Splits/CategorySplit.cpp @@ -83,11 +83,10 @@ namespace Ingester auto location = Location(dataObject->getDims().size(), 0); location[0] = rowIdx; - auto itemVal = dataObject->getAsFloat(location); - if (trunc(itemVal) == itemVal) + if (auto dat = std::dynamic_pointer_cast> (dataObject)) { - nameMap_.insert({static_cast (itemVal), - std::to_string(static_cast (itemVal))}); + auto itemVal = dat->get(location); + nameMap_.insert({itemVal, std::to_string(itemVal)}); } else { diff --git a/src/bufr/BufrParser/Exports/Variables/DatetimeVariable.cpp b/src/bufr/BufrParser/Exports/Variables/DatetimeVariable.cpp index d20bd149c..50116edbc 100644 --- a/src/bufr/BufrParser/Exports/Variables/DatetimeVariable.cpp +++ b/src/bufr/BufrParser/Exports/Variables/DatetimeVariable.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -71,8 +72,7 @@ namespace Ingester std::shared_ptr DatetimeVariable::exportData(const BufrDataMap& map) { checkKeys(map); - static const float missing = 1.e+11; - static const int64_t missing_int = INT_MIN; + static const int missingInt = DataObject::missingValue(); std::tm tm{}; // zero initialise tm.tm_year = 1970-1900; // 1970 @@ -83,26 +83,38 @@ namespace Ingester tm.tm_sec = 0; tm.tm_isdst = 0; // Not daylight saving std::time_t epochDt = std::mktime(&tm); - std::time_t this_time = std::mktime(&tm); - int64_t diff_time; std::vector timeOffsets; timeOffsets.reserve(map.at(getExportKey(ConfKeys::Year))->size()); + // Validation + if (map.at(getExportKey(ConfKeys::Year))->getDims().size() != 1 || + map.at(getExportKey(ConfKeys::Month))->getDims().size() != 1 || + map.at(getExportKey(ConfKeys::Day))->getDims().size() != 1 || + 
(!minuteQuery_.empty() && + map.at(getExportKey(ConfKeys::Minute))->getDims().size() != 1) || + (!secondQuery_.empty() && + map.at(getExportKey(ConfKeys::Second))->getDims().size() != 1)) + { + std::ostringstream errStr; + errStr << "Datetime variables must be 1 dimensional."; + throw eckit::BadParameter(errStr.str()); + } + for (unsigned int idx = 0; idx < map.at(getExportKey(ConfKeys::Year))->size(); idx++) { - int year = static_cast(map.at(getExportKey(ConfKeys::Year))->getAsFloat(idx)); - int month = static_cast(map.at(getExportKey(ConfKeys::Month))->getAsFloat(idx)); - int day = static_cast(map.at(getExportKey(ConfKeys::Day))->getAsFloat(idx)); - int hour = static_cast(map.at(getExportKey(ConfKeys::Hour))->getAsFloat(idx)); + int year = map.at(getExportKey(ConfKeys::Year))->getAsInt(idx); + int month = map.at(getExportKey(ConfKeys::Month))->getAsInt(idx); + int day = map.at(getExportKey(ConfKeys::Day))->getAsInt(idx); + int hour = map.at(getExportKey(ConfKeys::Hour))->getAsInt(idx); int minutes = 0; int seconds = 0; - diff_time = missing_int; - if (year != missing && - month != missing && - day != missing && - hour != missing) + auto diff_time = DataObject::missingValue(); + if (year != missingInt && + month != missingInt && + day != missingInt && + hour != missingInt) { tm.tm_year = year - 1900; tm.tm_mon = month - 1; @@ -114,8 +126,7 @@ namespace Ingester if (!minuteQuery_.empty()) { - minutes = - static_cast(map.at(getExportKey(ConfKeys::Minute))->getAsFloat(idx)); + minutes = map.at(getExportKey(ConfKeys::Minute))->getAsInt(idx); if (minutes >= 0 && minutes < 60) { @@ -125,8 +136,7 @@ namespace Ingester if (!secondQuery_.empty()) { - seconds = - static_cast(map.at(getExportKey(ConfKeys::Second))->getAsFloat(idx)); + seconds = map.at(getExportKey(ConfKeys::Second))->getAsInt(idx); if (seconds >= 0 && seconds < 60) { @@ -134,16 +144,18 @@ namespace Ingester } } - this_time = std::mktime(&tm); - if (this_time < 0) + // Be careful with mktime as it can be very 
slow. + auto thisTime = std::mktime(&tm); + if (thisTime < 0) { oops::Log::warning() << "Caution, date suspicious date (year, month, day): " << year << ", " << month << ", " << day << std::endl; } - diff_time = static_cast(difftime(this_time, epochDt) - + hoursFromUtc_*3600); + + diff_time = static_cast(difftime(thisTime, epochDt) + + hoursFromUtc_ * 3600); } timeOffsets.push_back(diff_time); diff --git a/src/bufr/BufrParser/Query/File.cpp b/src/bufr/BufrParser/Query/File.cpp index 2684baf30..d05aa582d 100644 --- a/src/bufr/BufrParser/Query/File.cpp +++ b/src/bufr/BufrParser/Query/File.cpp @@ -7,9 +7,11 @@ #include "File.h" +#include + #include "bufr_interface.h" -#include "Query.h" +#include "QueryRunner.h" #include "QuerySet.h" #include "DataProvider.h" @@ -61,7 +63,7 @@ namespace bufr { { static int SubsetLen = 9; unsigned int messageNum = 0; - char subset[SubsetLen]; + char subsetChars[SubsetLen]; int iddate; int bufrLoc; @@ -70,21 +72,27 @@ namespace bufr { auto dataProvider = DataProvider(fileUnit_); auto resultSet = ResultSet(querySet.names()); - auto query = Query(querySet, resultSet, dataProvider); + auto queryRunner = QueryRunner(querySet, resultSet, dataProvider); - while (ireadmg_f(fileUnit_, subset, &iddate, SubsetLen) == 0) + while (ireadmg_f(fileUnit_, subsetChars, &iddate, SubsetLen) == 0) { - while (ireadsb_f(fileUnit_) == 0) + auto subset = std::string(subsetChars); + subset.erase(std::remove_if(subset.begin(), subset.end(), isspace), subset.end()); + + if (querySet.includesSubset(subset)) { - status_f(fileUnit_, &bufrLoc, &il, &im); - dataProvider.updateData(bufrLoc); - query.query(); + while (ireadsb_f(fileUnit_) == 0) + { + status_f(fileUnit_, &bufrLoc, &il, &im); + dataProvider.updateData(bufrLoc); + queryRunner.accumulate(); + } + + if (next > 0 && ++messageNum >= next) break; } - - if (next > 0 && ++messageNum >= next) break; } - resultSet.setTargets(query.getTargets()); + resultSet.setTargets(queryRunner.getTargets()); 
dataProvider.deleteData(); diff --git a/src/bufr/BufrParser/Query/QueryParser.cpp b/src/bufr/BufrParser/Query/QueryParser.cpp index e2d91ebad..28ea941f5 100644 --- a/src/bufr/BufrParser/Query/QueryParser.cpp +++ b/src/bufr/BufrParser/Query/QueryParser.cpp @@ -14,7 +14,19 @@ namespace Ingester { namespace bufr { - std::vector QueryParser::splitMultiquery(const std::string &query) { + std::vector QueryParser::parse(const std::string& queryStr) + { + std::vector queries; + for (auto& subStr : QueryParser::splitMultiquery(queryStr)) + { + queries.emplace_back(QueryParser::splitQueryStr(subStr)); + } + + return queries; + } + + std::vector QueryParser::splitMultiquery(const std::string &query) + { std::vector subqueries; // Remove whitespace from query and assign to working_str @@ -66,11 +78,8 @@ namespace bufr { return subqueries; } - - void QueryParser::splitQueryStr(const std::string& query, - std::string& subset, - std::vector& mnemonics, - int& index) { + Query QueryParser::splitQueryStr(const std::string& query) + { // Find positions of slashes std::vector slashPositions; size_t slashIdx = 0; @@ -89,7 +98,7 @@ namespace bufr { } // Capture the subset string - subset = query.substr(0, slashPositions[0]); + auto subset = query.substr(0, slashPositions[0]); std::vector mnemonicStrings(slashPositions.size()); @@ -105,7 +114,7 @@ namespace bufr { std::string lastElement = query.substr(slashPositions[slashPositions.size() - 1] + 1); // Parse last element - index = -1; + int index = -1; size_t startSubscript = lastElement.find_first_of("["); size_t endSubscript = lastElement.find_first_of("]"); if (startSubscript != std::string::npos && endSubscript != std::string::npos) @@ -126,7 +135,13 @@ namespace bufr { mnemonicStrings.back() = lastElement; } - mnemonics = mnemonicStrings; + auto queryObj = Query(); + queryObj.queryStr = query; + queryObj.subset = subset; + queryObj.mnemonics = mnemonicStrings; + queryObj.index = index; + + return queryObj; } } // namespace bufr } 
// namespace Ingester diff --git a/src/bufr/BufrParser/Query/QueryParser.h b/src/bufr/BufrParser/Query/QueryParser.h index 1f860cea2..dd360e78e 100644 --- a/src/bufr/BufrParser/Query/QueryParser.h +++ b/src/bufr/BufrParser/Query/QueryParser.h @@ -13,24 +13,29 @@ namespace Ingester { namespace bufr { + struct Query + { + std::string queryStr; + std::string subset; + std::vector mnemonics; + int index; + }; + /// \brief Parses a user supplied query string into its component parts. /// \note Will be refactored to properly tokenize the query string. class QueryParser { public: + static std::vector parse(const std::string& queryStr); + + private: /// \brief Split a multi query (ex: ["*/CLONH", "*/CLON"]) into a vector of single queries. /// \param query The query to split. static std::vector splitMultiquery(const std::string& query); /// \brief Split a single query (ex: "*/ROSEQ1/ROSEQ2/PCCF[2]") into its component parts. /// \param query The query to split. - /// \param[out] subset The subset part of the query (ex: *). - /// \param[out] mnemonics Query path components (ex: ["ROSEQ1", "ROSEQ2", "PCCF"]). - /// \param[out] index The index associated with this query (ex: 2). - static void splitQueryStr(const std::string& query, - std::string& subset, - std::vector& mnemonics, - int& index); + static Query splitQueryStr(const std::string& query); private: /// \brief Private constructor. diff --git a/src/bufr/BufrParser/Query/Query.cpp b/src/bufr/BufrParser/Query/QueryRunner.cpp similarity index 85% rename from src/bufr/BufrParser/Query/Query.cpp rename to src/bufr/BufrParser/Query/QueryRunner.cpp index af1b685ac..fae955f90 100644 --- a/src/bufr/BufrParser/Query/Query.cpp +++ b/src/bufr/BufrParser/Query/QueryRunner.cpp @@ -4,7 +4,7 @@ * This software is licensed under the terms of the Apache Licence Version 2.0 * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
*/ -#include "Query.h" +#include "QueryRunner.h" #include "eckit/exception/Exceptions.h" #include "oops/util/Logger.h" @@ -12,7 +12,6 @@ #include #include -#include "QueryParser.h" #include "Constants.h" namespace Ingester { @@ -23,7 +22,7 @@ namespace bufr { std::vector counts; }; - Query::Query(const QuerySet &querySet, + QueryRunner::QueryRunner(const QuerySet &querySet, ResultSet &resultSet, const DataProvider &dataProvider) : querySet_(querySet), @@ -32,7 +31,8 @@ namespace bufr { { } - void Query::query() { + void QueryRunner::accumulate() + { Targets targets; std::shared_ptr<__details::ProcessingMasks> masks; @@ -40,9 +40,12 @@ namespace bufr { collectData(targets, masks, resultSet_); } - void Query::findTargets(Targets &targets, std::shared_ptr<__details::ProcessingMasks> &masks) { + void QueryRunner::findTargets(Targets &targets, + std::shared_ptr<__details::ProcessingMasks> &masks) + { // Check if the target list for this subset is cached - if (targetCache_.find(dataProvider_.getSubset()) != targetCache_.end()) { + if (targetCache_.find(dataProvider_.getSubset()) != targetCache_.end()) + { targets = targetCache_.at(dataProvider_.getSubset()); masks = maskCache_.at(dataProvider_.getSubset()); return; @@ -55,20 +58,21 @@ namespace bufr { masks->valueNodeMask.resize(numNodes, false); masks->pathNodeMask.resize(numNodes, false); - for (size_t targetIdx = 0; targetIdx < querySet_.size(); ++targetIdx) { - auto queryName = querySet_.nameAt(targetIdx); - auto queryStr = querySet_.queryAt(targetIdx); - - auto subQueries = QueryParser::splitMultiquery(queryStr); + for (size_t targetIdx = 0; targetIdx < querySet_.size(); ++targetIdx) + { + auto queryName = querySet_.names()[targetIdx]; + auto subQueries = querySet_.queriesFor(queryName); bool foundTarget = false; std::shared_ptr target; - for (size_t subQueryIdx = 0; subQueryIdx < subQueries.size(); ++subQueryIdx) { - const std::string &subQuery = subQueries[subQueryIdx]; + for (size_t subQueryIdx = 0; subQueryIdx < 
subQueries.size(); ++subQueryIdx) + { + const Query& subQuery = subQueries[subQueryIdx]; target = findTarget(queryName, subQuery); - if (target->nodeIds.size() > 0) { + if (target->nodeIds.size() > 0) + { // Collect mask data masks->valueNodeMask[target->nodeIds[0]] = true; for (size_t pathIdx = 0; pathIdx < target->seqPath.size(); ++pathIdx) { @@ -81,14 +85,34 @@ namespace bufr { } } - if (!foundTarget) { + if (!foundTarget) + { // Add the last missing target to the list targets.push_back(target); - oops::Log::warning() << "Warning: Query String " - << queryStr - << " didn't apply to subset " - << dataProvider_.getSubset() - << std::endl; + oops::Log::warning() << "Warning: Query String "; + + auto queries = querySet_.queriesFor(queryName); + + if (queries.size() == 1) + { + oops::Log::warning() << queries[0].queryStr; + } + else + { + oops::Log::warning() << "["; + for (auto subQuery = queries.cbegin(); + subQuery < queries.cend(); + ++subQuery) + { + if (subQuery != queries.cbegin()) oops::Log::warning() << ", "; + oops::Log::warning() << subQuery->queryStr; + } + oops::Log::warning() << "]"; + } + + oops::Log::warning() << " didn't apply to subset "; + oops::Log::warning() << dataProvider_.getSubset(); + oops::Log::warning() << std::endl; } } @@ -96,23 +120,18 @@ namespace bufr { maskCache_.insert({dataProvider_.getSubset(), masks}); } - std::shared_ptr Query::findTarget(const std::string &targetName, - const std::string &query) const { - std::string querySubset; - std::vector mnemonics; - int index; - - QueryParser::splitQueryStr(query, querySubset, mnemonics, index); - + std::shared_ptr QueryRunner::findTarget(const std::string &targetName, + const Query& query) const + { std::vector branches; std::vector targetNodes; std::vector seqPath; std::vector dimPaths; std::vector dimIdxs; - bool targetMissing = !(querySubset == "*" || querySubset == dataProvider_.getSubset()); + bool targetMissing = !(query.subset == "*" || query.subset == dataProvider_.getSubset()); 
if (!targetMissing) { - branches.resize(mnemonics.size() - 1); + branches.resize(query.mnemonics.size() - 1); seqPath.push_back(dataProvider_.getInode()); @@ -126,7 +145,7 @@ namespace bufr { dataProvider_.getTyp(nodeIdx) == Typ::Repeat || dataProvider_.getTyp(nodeIdx) == Typ::StackedRepeat) { if (isQueryNode(nodeIdx - 1)) { - if (dataProvider_.getTag(nodeIdx) == mnemonics[mnemonicCursor + 1] && + if (dataProvider_.getTag(nodeIdx) == query.mnemonics[mnemonicCursor + 1] && tableCursor == mnemonicCursor) { mnemonicCursor++; branches[mnemonicCursor] = nodeIdx - 1; @@ -134,9 +153,9 @@ namespace bufr { tableCursor++; } seqPath.push_back(nodeIdx); - } else if (mnemonicCursor == static_cast(mnemonics.size()) - 2 && + } else if (mnemonicCursor == static_cast(query.mnemonics.size()) - 2 && tableCursor == mnemonicCursor && - dataProvider_.getTag(nodeIdx) == mnemonics.back()) { + dataProvider_.getTag(nodeIdx) == query.mnemonics.back()) { // We found a target targetNodes.push_back(nodeIdx); getDimInfo(branches, mnemonicCursor, dimPaths, dimIdxs); @@ -195,21 +214,21 @@ namespace bufr { } } - if (index > 0 && index <= gsl::narrow(targetNodes.size())) { - targetNodes = {targetNodes[index - 1]}; + if (query.index > 0 && query.index <= gsl::narrow(targetNodes.size())) { + targetNodes = {targetNodes[query.index - 1]}; } if (targetNodes.size() > 1) { std::ostringstream errMsg; errMsg << "Query string must return 1 target. Are you missing an index? 
"; - errMsg << query << "."; + errMsg << query.queryStr << "."; throw eckit::BadParameter(errMsg.str()); } } auto target = std::make_shared(); target->name = targetName; - target->queryStr = query; + target->queryStr = query.queryStr; target->seqPath = branches; target->nodeIds = targetNodes; @@ -226,14 +245,14 @@ namespace bufr { return target; } - bool Query::isQueryNode(int nodeIdx) const { + bool QueryRunner::isQueryNode(int nodeIdx) const { return (dataProvider_.getTyp(nodeIdx) == Typ::DelayedRep || dataProvider_.getTyp(nodeIdx) == Typ::FixedRep || dataProvider_.getTyp(nodeIdx) == Typ::DelayedRepStacked || dataProvider_.getTyp(nodeIdx) == Typ::DelayedBinary); } - void Query::getDimInfo(const std::vector &branches, + void QueryRunner::getDimInfo(const std::vector &branches, int mnemonicCursor, std::vector &dimPaths, std::vector &dimIdxs) const { @@ -274,7 +293,7 @@ namespace bufr { } } - void Query::collectData(Targets& targets, + void QueryRunner::collectData(Targets& targets, std::shared_ptr<__details::ProcessingMasks> masks, ResultSet &resultSet) const { std::vector currentPath; diff --git a/src/bufr/BufrParser/Query/Query.h b/src/bufr/BufrParser/Query/QueryRunner.h similarity index 93% rename from src/bufr/BufrParser/Query/Query.h rename to src/bufr/BufrParser/Query/QueryRunner.h index 2075466f2..8e9d0d731 100644 --- a/src/bufr/BufrParser/Query/Query.h +++ b/src/bufr/BufrParser/Query/QueryRunner.h @@ -51,15 +51,17 @@ namespace bufr { } // namespace __details /// \brief Manages the execution of queries against on a BUFR file. - class Query + class QueryRunner { public: /// \brief Constructor. /// \param[in] querySet The set of queries to execute against the BUFR file. /// \param[in, out] resultSet The object used to store the accumulated collected data. /// \param[in] dataProvider The BUFR data provider to use. 
- Query(const QuerySet& querySet, ResultSet& resultSet, const DataProvider& dataProvider); - void query(); + QueryRunner(const QuerySet& querySet, + ResultSet& resultSet, + const DataProvider& dataProvider); + void accumulate(); Targets getTargets() { @@ -96,8 +98,8 @@ namespace bufr { /// \brief Find the target associated with a specific user provided query string. /// \param[in] targetName The name specified for the target. /// \param[in] query The query string to use. - std::shared_ptr findTarget(const std::string& targetName, - const std::string& query) const; + std::shared_ptr findTarget(const std::string &targetName, + const Query& query) const; /// \brief Does the node idx correspond to an element you'd find in a query string (repeat diff --git a/src/bufr/BufrParser/Query/QuerySet.cpp b/src/bufr/BufrParser/Query/QuerySet.cpp index cd44f8a98..5587136de 100644 --- a/src/bufr/BufrParser/Query/QuerySet.cpp +++ b/src/bufr/BufrParser/Query/QuerySet.cpp @@ -7,20 +7,82 @@ #include "QuerySet.h" +#include +#include namespace Ingester { namespace bufr { + QuerySet::QuerySet(const std::vector& subsets) : + includesAllSubsets_(false), + limitSubsets_(std::set(subsets.begin(), + subsets.end())), + presentSubsets_({}) + { + } + + void QuerySet::add(const std::string& name, const std::string& queryStr) + { + std::vector queries; + for (const auto &query : QueryParser::parse(queryStr)) + { + if (limitSubsets_.empty()) + { + if (query.subset == "*") + { + includesAllSubsets_ = true; + } + + presentSubsets_.insert(query.subset); + } + else + { + if (query.subset == "*") + { + presentSubsets_ = limitSubsets_; + } + else + { + presentSubsets_.insert(query.subset); + + std::vector newSubsets; + std::set_intersection(limitSubsets_.begin(), + limitSubsets_.end(), + presentSubsets_.begin(), + presentSubsets_.end(), + std::back_inserter(newSubsets)); + + presentSubsets_ = std::set(newSubsets.begin(), + newSubsets.end()); + } + } + + queries.emplace_back(query); + } + + 
queryMap_[name] = queries; + } + + bool QuerySet::includesSubset(const std::string& subset) const + { + bool includesSubset = true; + if (!includesAllSubsets_) + { + includesSubset = (presentSubsets_.find(subset) != presentSubsets_.end()); + } + + return includesSubset; + } + std::vector QuerySet::names() const { std::vector names; - for (auto const& query : queryList_) + for (auto const& query : queryMap_) { names.push_back(query.first); } return names; } - } // namespace bufr } // namespace Ingester diff --git a/src/bufr/BufrParser/Query/QuerySet.h b/src/bufr/BufrParser/Query/QuerySet.h index adcf26622..54864418f 100644 --- a/src/bufr/BufrParser/Query/QuerySet.h +++ b/src/bufr/BufrParser/Query/QuerySet.h @@ -7,46 +7,48 @@ #pragma once +#include #include +#include #include +#include "QueryParser.h" + namespace Ingester { namespace bufr { + typedef std::set Subsets; + /// \brief Manages a collection of queries. class QuerySet { public: - QuerySet() = default; + explicit QuerySet(const std::vector& subsets); ~QuerySet() = default; /// \brief Add a new query to the collection. /// \param[in] name The name of the query. /// \param[in] query The query string. - void add(const std::string& name, const std::string& query) - { - queryList_.push_back({name, query}); - } + void add(const std::string& name, const std::string& query); /// \brief Returns the size of the collection. - size_t size() const { return queryList_.size(); } - - /// \brief Returns the name of the query at the specified index. - /// \param[in] idx The index of the query.. - /// \return The name of the query. - std::string nameAt(size_t idx) const { return queryList_.at(idx).first; } - - /// \brief Returns the query string at the specified index. - /// \param[in] idx The index of the query. - /// \return The query string. - std::string queryAt(size_t idx) const { return queryList_.at(idx).second; } + size_t size() const { return queryMap_.size(); } /// \brief Returns the names of all the queries. 
/// \return A vector of the names of all the queries. std::vector names() const; + /// \brief Checks whether the given subset is included in this query set. + /// \return True if the subset should be processed, false otherwise. + bool includesSubset(const std::string& subset) const; + + std::vector queriesFor(const std::string& name) const { return queryMap_.at(name); } + private: - std::vector> queryList_; + std::unordered_map> queryMap_; + bool includesAllSubsets_; + Subsets limitSubsets_; + Subsets presentSubsets_; }; } // namespace bufr } // namespace Ingester diff --git a/src/bufr/CMakeLists.txt b/src/bufr/CMakeLists.txt index 8d1f27042..824359be0 100644 --- a/src/bufr/CMakeLists.txt +++ b/src/bufr/CMakeLists.txt @@ -43,8 +43,8 @@ list(APPEND _ingester_srcs BufrParser/Query/VectorMath.h BufrParser/Query/QuerySet.h BufrParser/Query/QuerySet.cpp - BufrParser/Query/Query.h - BufrParser/Query/Query.cpp + BufrParser/Query/QueryRunner.h + BufrParser/Query/QueryRunner.cpp BufrParser/Query/QueryParser.h BufrParser/Query/QueryParser.cpp BufrParser/Query/ResultSet.h diff --git a/src/bufr/DataObject.h b/src/bufr/DataObject.h index 5d3766579..9ca4972e6 100644 --- a/src/bufr/DataObject.h +++ b/src/bufr/DataObject.h @@ -110,6 +110,10 @@ namespace Ingester /// \return Float data. virtual float getAsFloat(const Location& loc) const = 0; + /// \brief Get the data at the index as an int. + /// \return Int data. + virtual int getAsInt(size_t idx) const = 0; + /// \brief Get the data at the index as an float. /// \return Float data. virtual float getAsFloat(size_t idx) const = 0; @@ -177,7 +181,7 @@ namespace Ingester { public: typedef T value_type; - constexpr T missingValue() const { return std::numeric_limits::max(); } + static constexpr T missingValue() { return std::numeric_limits::max(); } /// \brief Constructor. /// \param dimensions The dimensions of the data object. @@ -315,7 +319,7 @@ namespace Ingester /// \brief Get the data at the location as an integer. 
/// \param loc The coordinate for the data point (ex: if data 2d then loc {2,4} gets data /// at that coordinate). - /// \return Integer data. + /// \return Int data. int getAsInt(const Location& loc) const final { return _getAsInt(loc); } /// \brief Get the data at the location as a float. @@ -330,12 +334,22 @@ namespace Ingester /// \return String data. std::string getAsString(const Location& loc) const final { return _getAsString(loc); } - /// \brief Get the data at the index into the internal 1d array as a float. This function + + /// \brief Get the data at the index into the internal 1d array as an int. This function /// gives you direct access to the internal data and doesn't account for dimensional - /// information (its up to the user). Note: getAsFloat(const Location&) is safer. + /// information (it's up to the user). Note: getAsInt(const Location&) is safer. + /// \param idx The idx into the internal 1d array. + /// \return Int data. + int getAsInt(size_t idx) const final { return _getAsInt(idx); } + + + /// \brief Get the data at the index into the internal 1d array as a float. This + /// function gives you direct access to the internal data and doesn't account for + /// dimensional information (it's up to the user). Note: getAsFloat(const Location&) + /// is safer. /// \param idx The idx into the internal 1d array. /// \return Float data. - float getAsFloat(size_t idx) const final { return _getAsFloat(idx); } + float getAsFloat(const size_t idx) const final { return _getAsFloat(idx); } /// \brief Slice the dta object according to a list of indices. /// \param rows The indices to slice the data object by. @@ -477,6 +491,24 @@ namespace Ingester return get(loc); } + /// \brief Get the data at the index as an int for numeric data. + /// \return Int data. 
+ template + int _getAsInt(size_t idx, + typename std::enable_if::value, U>::type* = nullptr) const + { + return static_cast(data_[idx]); + } + + /// \brief Get the data at the index as a int for non-numeric data. + /// \return Int data. + template + int _getAsInt(size_t idx, + typename std::enable_if::value, U>::type* = nullptr) const + { + throw std::runtime_error("The stored value is not a number"); + } + /// \brief Get the data at the index as a float for numeric data. /// \return Float data. template diff --git a/src/bufr/README.md b/src/bufr/README.md index 7a0f9e3cc..55f49e8ed 100644 --- a/src/bufr/README.md +++ b/src/bufr/README.md @@ -55,6 +55,10 @@ Defines how to read data from the input BUFR file. Its sections are as follows: ```yaml exports: group_by_variable: longitude # Optional + subsets: + - NC004001 + - NC004002 + - NC004003 variables: timestamp: datetime: @@ -99,7 +103,8 @@ ioda encoder. It has the following sections: * `group_by_variable` _(optional)_ String value that defines the name of the variable to group observations by. If this field is missing then observations will not be grouped. - +* `subsets` _(optional)_ List of subsets that you want to process. If the field is not present then + all subsets will be processed in accordance with the query definitions. * `variables` * **keys** are arbitrary strings (anything you want). They can be referenced in the ioda section. 
* **values** (One of these types): diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1abd7cb7c..eba7db51f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -216,6 +216,9 @@ if( iodaconv_bufr_ENABLED ) testinput/satwind_Insat_wmo.bufr testinput/vadwinds_wmoBUFR2ioda.yaml testinput/vadwinds_wmo_multi.bufr + testinput/gdas.t12z.aircft.tm00.bufr_d + testinput/bufr_specific_subsets_by_query.yaml + testinput/bufr_specifying_subsets.yaml ) list( APPEND test_output @@ -259,6 +262,7 @@ if( iodaconv_bufr_ENABLED ) testoutput/satwind_Himawari.nc testoutput/satwind_Insat.nc testoutput/vadwinds_wmo_multi.nc + testoutput/bufr_specifying_subsets.nc ) endif() @@ -1216,6 +1220,24 @@ if(iodaconv_bufr_ENABLED) gdas.t00z.sevcsr.tm00.nc ${IODA_CONV_COMP_TOL_ZERO} DEPENDS bufr2ioda.x ) + ecbuild_add_test( TARGET test_iodaconv_bufr_specific_subsets_by_query + TYPE SCRIPT + COMMAND bash + ARGS ${CMAKE_BINARY_DIR}/bin/iodaconv_comp.sh + netcdf + "${CMAKE_BINARY_DIR}/bin/bufr2ioda.x testinput/bufr_specific_subsets_by_query.yaml" + bufr_specifying_subsets.nc ${IODA_CONV_COMP_TOL_ZERO} + DEPENDS bufr2ioda.x ) + + ecbuild_add_test( TARGET test_iodaconv_bufr_specifying_subsets + TYPE SCRIPT + COMMAND bash + ARGS ${CMAKE_BINARY_DIR}/bin/iodaconv_comp.sh + netcdf + "${CMAKE_BINARY_DIR}/bin/bufr2ioda.x testinput/bufr_specifying_subsets.yaml" + bufr_specifying_subsets.nc ${IODA_CONV_COMP_TOL_ZERO} + DEPENDS bufr2ioda.x ) + # FIXME: Greg Thompson # ecbuild_add_test( TARGET test_iodaconv_bufr_aircar # TYPE SCRIPT diff --git a/test/testinput/bufr_specific_subsets_by_query.yaml b/test/testinput/bufr_specific_subsets_by_query.yaml new file mode 100644 index 000000000..a2023f3d1 --- /dev/null +++ b/test/testinput/bufr_specific_subsets_by_query.yaml @@ -0,0 +1,48 @@ +# (C) Copyright 2020 NOAA/NWS/NCEP/EMC +# # # +# # # This software is licensed under the terms of the Apache Licence Version 2.0 +# # # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
+# +observations: + - obs space: + name: bufr + + obsdatain: "./testinput/gdas.t12z.aircft.tm00.bufr_d" + + exports: + #MetaData + variables: + timestamp: + datetime: + year: "[NC004001/YEAR, NC004002/YEAR, NC004003/YEAR, NC004006/YEAR, NC004009/YEAR, NC004010/YEAR, NC004011/YEAR]" + month: "[NC004001/MNTH, NC004002/MNTH, NC004003/MNTH, NC004006/MNTH, NC004009/MNTH, NC004010/MNTH, NC004011/MNTH]" + day: "[NC004001/DAYS, NC004002/DAYS, NC004003/DAYS, NC004006/DAYS, NC004009/DAYS, NC004010/DAYS, NC004011/DAYS]" + hour: "[NC004001/HOUR, NC004002/HOUR, NC004003/HOUR, NC004006/HOUR, NC004009/HOUR, NC004010/HOUR, NC004011/HOUR]" + minute: "[NC004001/MINU, NC004002/MINU, NC004003/MINU, NC004006/MINU, NC004009/MINU, NC004010/MINU, NC004011/MINU]" + latitude: + query: "[NC004001/CLAT, NC004002/CLAT, NC004003/CLAT, NC004006/CLATH, NC004009/CLATH, NC004010/CLATH, NC004011/CLATH]" + longitude: + query: "[NC004001/CLON, NC004002/CLON, NC004003/CLON, NC004006/CLONH, NC004009/CLONH, NC004010/CLONH, NC004011/CLONH]" + + ioda: + backend: netcdf + obsdataout: "./testrun/bufr_specifying_subsets.nc" + + #MetaData + variables: + - name: "MetaData/dateTime" + source: variables/timestamp + longName: "Datetime" + units: "seconds since 1970-01-01T00:00:00Z" + + - name: "MetaData/latitude" + source: variables/latitude + longName: "Latitude" + units: "degree_north" + range: [-90, 90] + + - name: "MetaData/longitude" + source: variables/longitude + longName: "Longitude" + units: "degree_east" + range: [-180, 180] diff --git a/test/testinput/bufr_specifying_subsets.yaml b/test/testinput/bufr_specifying_subsets.yaml new file mode 100644 index 000000000..3e45a1acc --- /dev/null +++ b/test/testinput/bufr_specifying_subsets.yaml @@ -0,0 +1,57 @@ +# (C) Copyright 2020 NOAA/NWS/NCEP/EMC +# # # +# # # This software is licensed under the terms of the Apache Licence Version 2.0 +# # # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
+# +observations: + - obs space: + name: bufr + + obsdatain: "./testinput/gdas.t12z.aircft.tm00.bufr_d" + + exports: + subsets: + - NC004001 + - NC004002 + - NC004003 + - NC004006 + - NC004009 + - NC004010 + - NC004011 + + #MetaData + variables: + timestamp: + datetime: + year: "*/YEAR" + month: "*/MNTH" + day: "*/DAYS" + hour: "*/HOUR" + minute: "*/MINU" + latitude: + query: "[*/CLATH, */CLAT]" + longitude: + query: "[*/CLONH, */CLON]" + + ioda: + backend: netcdf + obsdataout: "./testrun/bufr_specifying_subsets.nc" + + #MetaData + variables: + - name: "MetaData/dateTime" + source: variables/timestamp + longName: "Datetime" + units: "seconds since 1970-01-01T00:00:00Z" + + - name: "MetaData/latitude" + source: variables/latitude + longName: "Latitude" + units: "degree_north" + range: [-90, 90] + + - name: "MetaData/longitude" + source: variables/longitude + longName: "Longitude" + units: "degree_east" + range: [-180, 180] diff --git a/test/testinput/gdas.t12z.aircft.tm00.bufr_d b/test/testinput/gdas.t12z.aircft.tm00.bufr_d new file mode 100644 index 000000000..704f6fa51 --- /dev/null +++ b/test/testinput/gdas.t12z.aircft.tm00.bufr_d @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b1ee8552c02f8015a32fe3eea1708fa3fd39c8bf02bd5cd4a7a42d50c2adf9 +size 123088 diff --git a/test/testoutput/bufr_specifying_subsets.nc b/test/testoutput/bufr_specifying_subsets.nc new file mode 100644 index 000000000..ecb61585f --- /dev/null +++ b/test/testoutput/bufr_specifying_subsets.nc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3b86786b014bf39de2abaa815688ee982a32aadebe50149ea016acadf5646f +size 24062