diff --git a/velox/type/CMakeLists.txt b/velox/type/CMakeLists.txt index 499cb521005..ea26bcf75b4 100644 --- a/velox/type/CMakeLists.txt +++ b/velox/type/CMakeLists.txt @@ -15,6 +15,7 @@ if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() +add_subdirectory(parser) add_subdirectory(tz) add_subdirectory(fbhive) diff --git a/velox/type/parser/CMakeLists.txt b/velox/type/parser/CMakeLists.txt new file mode 100644 index 00000000000..41803204bbd --- /dev/null +++ b/velox/type/parser/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+if(${VELOX_BUILD_TESTING})
+  add_subdirectory(tests)
+endif()
+# Generate the C++ parser sources and TypeParser.yy.h from the bison grammar.
+bison_target(
+  TypeParser TypeParser.yy ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.cc
+  DEFINES_FILE ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.h)
+# Generate the scanner (Scanner.cpp) from the flex specification.
+flex_target(
+  TypeParserScanner TypeParser.ll ${CMAKE_CURRENT_BINARY_DIR}/Scanner.cpp
+  COMPILE_FLAGS "-Cf --prefix=veloxtp")
+# The scanner includes TypeParser.yy.h, which the parser target produces.
+add_flex_bison_dependency(TypeParserScanner TypeParser)
+# Include paths for the generated headers are scoped to this target only.
+add_library(
+  velox_type_parser ${BISON_TypeParser_OUTPUTS}
+  ${FLEX_TypeParserScanner_OUTPUTS} Scanner.h TypeParser.h)
+target_include_directories(
+  velox_type_parser PRIVATE ${PROJECT_BINARY_DIR} ${FLEX_INCLUDE_DIRS})
+target_link_libraries(velox_type_parser velox_common_base)
diff --git a/velox/type/parser/Scanner.h b/velox/type/parser/Scanner.h
new file mode 100644
index 00000000000..bc4a820c4c9
--- /dev/null
+++ b/velox/type/parser/Scanner.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "velox/common/base/Exceptions.h"
+#include "velox/type/Type.h"
+
+namespace facebook::velox::type {
+/// Lexer for type strings. The parse result is written to 'outputType'.
+class Scanner : public yyFlexLexer {
+ public:
+  Scanner(
+      std::istream& arg_yyin,
+      std::ostream& arg_yyout,
+      TypePtr& outputType,
+      const std::string_view input)
+      : yyFlexLexer(&arg_yyin, &arg_yyout),
+        outputType_(outputType),
+        input_(input) {}
+  int lex(Parser::semantic_type* yylval); // Generated by flex, see YY_DECL.
+  // Stores the parse result into the caller-owned 'outputType' reference.
+  void setType(TypePtr type) {
+    outputType_ = std::move(type);
+  }
+
+  // Returns the original input text so it can be printed in error messages.
+  std::string_view input() const {
+    return input_;
+  }
+
+ private:
+  TypePtr& outputType_; // Not owned; refers to the caller's variable.
+  const std::string_view input_; // Not owned; views the caller's string.
+};
+
+} // namespace facebook::velox::type
diff --git a/velox/type/parser/TypeParser.h b/velox/type/parser/TypeParser.h
new file mode 100644
index 00000000000..ba5d013d176
--- /dev/null
+++ b/velox/type/parser/TypeParser.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include "velox/type/Type.h"
+
+namespace facebook::velox {
+
+/// Parses a type string in Presto format to Velox type.
+/// Example type strings:
+/// row(col0 bigint, varchar)
+/// array(bigint)
+/// map(bigint, array(bigint))
+/// function(bigint,bigint,bigint)
+/// The parsing is case-insensitive. i.e. 'Row' and 'row' are equal.
+/// Field names for rows are optional.
+/// Quoted field names are supported.
+/// All custom types need to be registered. An error is thrown otherwise.
+/// Types with spaces must be explicitly handled in the parser.
+/// Uses the Type::getType API to convert a string to Velox type.
+TypePtr parseType(const std::string& typeText);
+
+} // namespace facebook::velox
diff --git a/velox/type/parser/TypeParser.ll b/velox/type/parser/TypeParser.ll
new file mode 100644
index 00000000000..b81755244ae
--- /dev/null
+++ b/velox/type/parser/TypeParser.ll
@@ -0,0 +1,63 @@
+%{
+#include
+#include
+
+#include "velox/type/parser/TypeParser.yy.h" // @manual
+#include "velox/type/parser/Scanner.h"
+// Emit the scanner loop as Scanner::lex() so rules can fill the bison
+// semantic value passed in as 'yylval'.
+#define YY_DECL int facebook::velox::type::Scanner::lex(facebook::velox::type::Parser::semantic_type *yylval)
+%}
+
+%option c++ noyywrap noyylineno nodefault caseless
+
+/* Keywords are written in upper case; 'caseless' makes matching ignore case. */
+WORD              ([[:alpha:][:alnum:]_]*)
+QUOTED_ID         (['"'][[:alnum:][:space:]_]*['"'])
+NUMBER            ([[:digit:]]+)
+ROW               (ROW|STRUCT)
+VARIABLE          (VARCHAR|VARBINARY)
+TYPE_WITH_SPACES  ((DOUBLE[ ]PRECISION)|(TIME[ ]WITH[ ]TIME[ ]ZONE)|(TIMESTAMP[ ]WITH[ ]TIME[ ]ZONE)|(INTERVAL[ ]YEAR[ ]TO[ ]MONTH)|(INTERVAL[ ]DAY[ ]TO[ ]SECOND))
+
+%%
+
+"("                return Parser::token::LPAREN;
+")"                return Parser::token::RPAREN;
+","                return Parser::token::COMMA;
+(ARRAY)            return Parser::token::ARRAY;
+(MAP)              return Parser::token::MAP;
+(FUNCTION)         return Parser::token::FUNCTION;
+(DECIMAL)          return Parser::token::DECIMAL;
+{ROW}              return Parser::token::ROW;
+{VARIABLE}         yylval->build(YYText()); return Parser::token::VARIABLE;
+{NUMBER}           yylval->build(folly::to(YYText())); return Parser::token::NUMBER;
+{WORD}             yylval->build(YYText()); return Parser::token::WORD;
+{TYPE_WITH_SPACES} yylval->build(YYText()); return Parser::token::TYPE_WITH_SPACES;
+{QUOTED_ID}        yylval->build(YYText()); return Parser::token::QUOTED_ID;
+<>                 return Parser::token::YYEOF;
+.                  /* no action on unmatched input */
+
+%%
+
+// Scanning is done by Scanner::lex() (see YY_DECL above); this entry point of
+// the base lexer class must not be called.
+int yyFlexLexer::yylex() {
+  throw std::runtime_error("Bad call to yyFlexLexer::yylex()");
+}
+
+#include "velox/type/parser/TypeParser.h"
+
+// Parses 'typeText' and returns the resulting type. Throws if the text cannot
+// be parsed or if no type was produced.
+facebook::velox::TypePtr facebook::velox::parseType(
+    const std::string& typeText) {
+  std::istringstream is(typeText);
+  facebook::velox::TypePtr type;
+  // The scanner keeps a reference to 'type'; the grammar's start rule stores
+  // the result there through Scanner::setType().
+  facebook::velox::type::Scanner scanner{is, std::cerr, type, typeText};
+  facebook::velox::type::Parser parser{&scanner};
+  parser.parse();
+  VELOX_CHECK(type, "Failed to parse type [{}]", typeText);
+  return type;
+}
diff --git a/velox/type/parser/TypeParser.yy b/velox/type/parser/TypeParser.yy
new file mode 100644
index 00000000000..573f614e4d0
--- /dev/null
+++ b/velox/type/parser/TypeParser.yy
@@ -0,0 +1,136 @@
+%{
+#include
+#include "velox/common/base/Exceptions.h"
+#include "velox/type/Type.h"
+%}
+%require "3.0.4"
+%language "C++"
+
+%define parser_class_name {Parser}
+%define api.namespace {facebook::velox::type}
+%define api.value.type variant
+%parse-param {Scanner* scanner}
+// The verbose message is forwarded to Parser::error() at the end of this file.
+%define parse.error verbose
+
+%code requires
+{
+    namespace facebook::velox::type {
+        class Scanner;
+    } // namespace facebook::velox::type
+    namespace facebook::velox {
+        class Type;
+    } // namespace facebook::velox
+    // Field names and types collected while parsing a row's field list.
+    // An unnamed field is recorded with an empty name.
+    struct RowArguments {
+       std::vector names;
+       std::vector> types;
+    };
+} // %code requires
+
+%code
+{
+    #include
+    #define yylex(x) scanner->lex(x)
+    using namespace facebook::velox;
+    // Maps a type name to a registered Velox type. The lookup uses the
+    // upper-cased name; 'INT' and 'DOUBLE PRECISION' are aliases for INTEGER
+    // and DOUBLE. Throws if no type is registered under the name.
+    // Only used by the grammar actions below, hence 'static'.
+    static TypePtr typeFromString(const std::string& type) {
+        auto upper = type;
+        std::transform(upper.begin(), upper.end(), upper.begin(), ::toupper);
+        if (upper == "INT") {
+            upper = "INTEGER";
+        } else if (upper == "DOUBLE PRECISION") {
+            upper = "DOUBLE";
+        }
+        auto inferredType = getType(upper, {});
+        VELOX_CHECK(inferredType, "Failed to parse type [{}]. Type not registered.", type);
+        return inferredType;
+    }
+}
+
+%token LPAREN RPAREN COMMA ARRAY MAP ROW FUNCTION DECIMAL
+%token WORD VARIABLE QUOTED_ID TYPE_WITH_SPACES
+%token NUMBER
+%token YYEOF 0
+
+%nterm > type array_type map_type variable_type
+%nterm >> named_type
+%nterm > row_type function_type decimal_type simple_type
+%nterm identifier
+%nterm >> type_list
+%nterm type_list_opt_names
+
+%%
+
+// Start rule. A top-level 'name type' pair is accepted; the name is dropped.
+type_spec : named_type         { scanner->setType($1.second); }
+          | type               { scanner->setType($1); }
+          | error              { yyerrok; }
+          ;
+
+named_type : identifier type   { $$ = std::make_pair($1, $2); }
+           ;
+
+type : array_type              { $$ = $1; }
+     | map_type                { $$ = $1; }
+     | row_type                { $$ = $1; }
+     | simple_type             { $$ = $1; }
+     | function_type           { $$ = $1; }
+     | variable_type           { $$ = $1; }
+     | decimal_type            { $$ = $1; }
+     ;
+
+simple_type : WORD             { $$ = typeFromString($1); }
+            | TYPE_WITH_SPACES { $$ = typeFromString($1); }
+            ;
+
+// The length in varchar(n) / varbinary(n) is accepted and ignored.
+variable_type : VARIABLE LPAREN NUMBER RPAREN  { $$ = typeFromString($1); }
+              | VARIABLE                       { $$ = typeFromString($1); }
+              ;
+
+array_type : ARRAY LPAREN type RPAREN          { $$ = ARRAY($3); }
+           ;
+
+decimal_type : DECIMAL LPAREN NUMBER COMMA NUMBER RPAREN { $$ = DECIMAL($3, $5); }
+             ;
+
+type_list : type                   { $$.push_back($1); }
+          | type_list COMMA type   { $1.push_back($3); $$ = std::move($1); }
+          ;
+
+// Row fields; unnamed fields are recorded with an empty name.
+type_list_opt_names : type                           { $$.names.push_back(""); $$.types.push_back($1); }
+                    | named_type                     { $$.names.push_back($1.first); $$.types.push_back($1.second); }
+                    | type_list_opt_names COMMA type { $1.names.push_back(""); $1.types.push_back($3);
+                                                       $$.names = std::move($1.names); $$.types = std::move($1.types); }
+                    | type_list_opt_names COMMA named_type { $1.names.push_back($3.first); $1.types.push_back($3.second);
+                                                             $$.names = std::move($1.names); $$.types = std::move($1.types); }
+                    ;
+
+row_type : ROW LPAREN type_list_opt_names RPAREN { $$ = ROW(std::move($3.names), std::move($3.types)); }
+         ;
+
+map_type : MAP LPAREN type COMMA type RPAREN { $$ = MAP($3, $5); }
+         ;
+
+// The last type in the list is the return type; the rest are the arguments.
+function_type : FUNCTION LPAREN type_list RPAREN { auto returnType = $3.back(); $3.pop_back();
+                                                   $$ = FUNCTION(std::move($3), returnType); }
+              ;
+
+identifier : QUOTED_ID { $1.erase(0, 1); $1.pop_back(); $$ = $1; } // Remove the quotes.
+           | WORD      { $$ = $1; }
+           ;
+
+%%
+
+// Reports a syntax error. Throws with the original input text followed by the
+// parser's own description of the error ('msg').
+void facebook::velox::type::Parser::error(const std::string& msg) {
+    VELOX_FAIL("Failed to parse type [{}]. {}", scanner->input(), msg);
+}
diff --git a/velox/type/parser/tests/CMakeLists.txt b/velox/type/parser/tests/CMakeLists.txt
new file mode 100644
index 00000000000..69f4615d3da
--- /dev/null
+++ b/velox/type/parser/tests/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_type_parser_test TypeParserTest.cpp)
+
+add_test(NAME velox_type_parser_test COMMAND velox_type_parser_test)
+
+target_link_libraries(velox_type_parser_test velox_type_parser velox_type gtest
+                      gtest_main gmock)
diff --git a/velox/type/parser/tests/TypeParserTest.cpp b/velox/type/parser/tests/TypeParserTest.cpp
new file mode 100644
index 00000000000..d725397625c
--- /dev/null
+++ b/velox/type/parser/tests/TypeParserTest.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/type/parser/TypeParser.h"
+
+namespace facebook::velox {
+namespace {
+// Stand-in custom type; each instance is only equivalent to itself.
+class CustomType : public VarcharType {
+ public:
+  CustomType() = default;
+
+  bool equivalent(const Type& other) const override {
+    // Pointer comparison works since this type is a singleton.
+    return this == &other;
+  }
+};
+
+static const TypePtr& JSON() {
+  static const TypePtr instance{new CustomType()};
+  return instance;
+}
+
+static const TypePtr& TIMESTAMP_WITH_TIME_ZONE() {
+  static const TypePtr instance{new CustomType()};
+  return instance;
+}
+
+static const TypePtr& TIMESTAMP_WITHOUT_TIME_ZONE() {
+  static const TypePtr instance{new CustomType()};
+  return instance;
+}
+// Factory that always returns the type it was constructed with.
+class TypeFactories : public CustomTypeFactories {
+ public:
+  explicit TypeFactories(const TypePtr& type) : type_(type) {}
+
+  TypePtr getType() const override {
+    return type_;
+  }
+
+  exec::CastOperatorPtr getCastOperator() const override {
+    return nullptr;
+  }
+
+ private:
+  TypePtr type_;
+};
+// Fixture that registers the custom types used by the tests below.
+class TestTypeSignature : public ::testing::Test {
+ private:
+  void SetUp() override {
+    // Register custom types with and without spaces.
+    // Does not need any parser support.
+    registerCustomType("json", std::make_unique(JSON()));
+    // Needs and has parser support.
+    registerCustomType(
+        "timestamp with time zone",
+        std::make_unique(TIMESTAMP_WITH_TIME_ZONE()));
+    // Needs and does not have parser support.
+    registerCustomType(
+        "timestamp without time zone",
+        std::make_unique(TIMESTAMP_WITHOUT_TIME_ZONE()));
+  }
+};
+
+TEST_F(TestTypeSignature, booleanType) {
+  ASSERT_EQ(*parseType("boolean"), *BOOLEAN());
+}
+
+TEST_F(TestTypeSignature, integerType) {
+  ASSERT_EQ(*parseType("int"), *INTEGER());
+  ASSERT_EQ(*parseType("integer"), *INTEGER());
+}
+
+TEST_F(TestTypeSignature, varcharType) {
+  ASSERT_EQ(*parseType("varchar"), *VARCHAR());
+  ASSERT_EQ(*parseType("varchar(4)"), *VARCHAR());
+}
+
+TEST_F(TestTypeSignature, varbinary) {
+  ASSERT_EQ(*parseType("varbinary"), *VARBINARY());
+}
+
+TEST_F(TestTypeSignature, arrayType) {
+  ASSERT_EQ(*parseType("array(bigint)"), *ARRAY(BIGINT()));
+
+  ASSERT_EQ(*parseType("array(int)"), *ARRAY(INTEGER()));
+  ASSERT_EQ(*parseType("array(integer)"), *ARRAY(INTEGER()));
+
+  ASSERT_EQ(*parseType("array(array(bigint))"), *ARRAY(ARRAY(BIGINT())));
+
+  ASSERT_EQ(*parseType("array(array(int))"), *ARRAY(ARRAY(INTEGER())));
+}
+
+TEST_F(TestTypeSignature, mapType) {
+  ASSERT_EQ(*parseType("map(bigint,bigint)"), *MAP(BIGINT(), BIGINT()));
+
+  ASSERT_EQ(
+      *parseType("map(bigint,array(bigint))"), *MAP(BIGINT(), ARRAY(BIGINT())));
+
+  ASSERT_EQ(
+      *parseType("map(bigint,map(bigint,map(varchar,bigint)))"),
+      *MAP(BIGINT(), MAP(BIGINT(), MAP(VARCHAR(), BIGINT()))));
+}
+
+TEST_F(TestTypeSignature, invalidType) {
+  VELOX_ASSERT_THROW(parseType("blah()"), "Failed to parse type [blah()]");
+
+  VELOX_ASSERT_THROW(parseType("array()"), "Failed to parse type [array()]");
+
+  VELOX_ASSERT_THROW(parseType("map()"), "Failed to parse type [map()]");
+
+  VELOX_ASSERT_THROW(parseType("x"), "Failed to parse type [x]");
+
+  // Ensure this is not treated as a row type.
+  VELOX_ASSERT_THROW(
+      parseType("rowxxx(a)"), "Failed to parse type [rowxxx(a)]");
+}
+
+TEST_F(TestTypeSignature, rowType) {
+  ASSERT_EQ(
+      *parseType("row(a bigint,b varchar,c real)"),
+      *ROW({"a", "b", "c"}, {BIGINT(), VARCHAR(), REAL()}));
+
+  ASSERT_EQ(
+      *parseType("row(a bigint,b array(bigint),c row(a bigint))"),
+      *ROW(
+          {"a", "b", "c"},
+          {BIGINT(), ARRAY(BIGINT()), ROW({"a"}, {BIGINT()})}));
+
+  ASSERT_EQ(
+      *parseType("row(\"12 tb\" bigint,b bigint,c bigint)"),
+      *ROW({"12 tb", "b", "c"}, {BIGINT(), BIGINT(), BIGINT()}));
+
+  ASSERT_EQ(
+      *parseType("row(a varchar(10),b row(a bigint))"),
+      *ROW({"a", "b"}, {VARCHAR(), ROW({"a"}, {BIGINT()})}));
+
+  ASSERT_EQ(
+      *parseType("array(row(col0 bigint,col1 double))"),
+      *ARRAY(ROW({"col0", "col1"}, {BIGINT(), DOUBLE()})));
+
+  ASSERT_EQ(
+      *parseType("row(col0 array(row(col0 bigint,col1 double)))"),
+      *ROW({"col0"}, {ARRAY(ROW({"col0", "col1"}, {BIGINT(), DOUBLE()}))}));
+
+  ASSERT_EQ(*parseType("row(bigint,varchar)"), *ROW({BIGINT(), VARCHAR()}));
+
+  ASSERT_EQ(
+      *parseType("row(bigint,array(bigint),row(a bigint))"),
+      *ROW({BIGINT(), ARRAY(BIGINT()), ROW({"a"}, {BIGINT()})}));
+
+  ASSERT_EQ(
+      *parseType("row(varchar(10),b row(bigint))"),
+      *ROW({"", "b"}, {VARCHAR(), ROW({BIGINT()})}));
+
+  ASSERT_EQ(
+      *parseType("array(row(col0 bigint,double))"),
+      *ARRAY(ROW({"col0", ""}, {BIGINT(), DOUBLE()})));
+
+  ASSERT_EQ(
+      *parseType("row(col0 array(row(bigint,double)))"),
+      *ROW({"col0"}, {ARRAY(ROW({BIGINT(), DOUBLE()}))}));
+
+  ASSERT_EQ(
+      *parseType("row(double double precision)"), *ROW({"double"}, {DOUBLE()}));
+
+  ASSERT_EQ(*parseType("row(double precision)"), *ROW({DOUBLE()}));
+
+  ASSERT_EQ(
+      *parseType("RoW(a bigint,b varchar)"),
+      *ROW({"a", "b"}, {BIGINT(), VARCHAR()}));
+
+  ASSERT_EQ(*parseType("row(array(Json))"), *ROW({ARRAY(JSON())}));
+
+  VELOX_ASSERT_THROW(
+      *parseType("row(col0 row(array(HyperLogLog)))"),
+      "Failed to parse type [HyperLogLog]. Type not registered.");
+
+  // Field type canonicalization.
+  ASSERT_EQ(*parseType("row(col iNt)"), *ROW({"col"}, {INTEGER()}));
+}
+
+TEST_F(TestTypeSignature, typesWithSpaces) {
+  // Type is handled by the parser but is not registered.
+  VELOX_ASSERT_THROW(
+      parseType("row(time time with time zone)"),
+      "Failed to parse type [time with time zone]. Type not registered.");
+
+  // Type is not handled by the parser but is registered.
+  VELOX_ASSERT_THROW(
+      parseType("row(col0 timestamp without time zone)"),
+      "Failed to parse type [row(col0 timestamp without time zone)]");
+
+  ASSERT_EQ(
+      *parseType("row(double double precision)"), *ROW({"double"}, {DOUBLE()}));
+
+  VELOX_ASSERT_THROW(
+      parseType("row(time with time zone)"),
+      "Failed to parse type [time with time zone]");
+
+  ASSERT_EQ(*parseType("row(double precision)"), *ROW({DOUBLE()}));
+
+  ASSERT_EQ(
+      *parseType("row(INTERval DAY TO SECOND)"), *ROW({INTERVAL_DAY_TIME()}));
+
+  ASSERT_EQ(
+      *parseType("row(INTERVAL YEAR TO month)"), *ROW({INTERVAL_YEAR_MONTH()}));
+
+  // quoted field names
+  ASSERT_EQ(
+      *parseType(
+          "row(\"timestamp with time zone\" timestamp with time zone,\"double\" double)"),
+      *ROW(
+          {"timestamp with time zone", "double"},
+          {TIMESTAMP_WITH_TIME_ZONE(), DOUBLE()}));
+}
+
+TEST_F(TestTypeSignature, intervalYearToMonthType) {
+  ASSERT_EQ(
+      *parseType("row(interval interval year to month)"),
+      *ROW({"interval"}, {INTERVAL_YEAR_MONTH()}));
+
+  ASSERT_EQ(
+      *parseType("row(interval year to month)"), *ROW({INTERVAL_YEAR_MONTH()}));
+}
+
+TEST_F(TestTypeSignature, functionType) {
+  ASSERT_EQ(
+      *parseType("function(bigint,bigint,bigint)"),
+      *FUNCTION({BIGINT(), BIGINT()}, BIGINT()));
+  ASSERT_EQ(
+      *parseType("function(bigint,array(varchar),varchar)"),
+      *FUNCTION({BIGINT(), ARRAY(VARCHAR())}, VARCHAR()));
+}
+
+TEST_F(TestTypeSignature, decimalType) {
+  ASSERT_EQ(*parseType("decimal(10, 5)"), *DECIMAL(10, 5));
+  ASSERT_EQ(*parseType("decimal(20,10)"), *DECIMAL(20, 10));
+
+  VELOX_ASSERT_THROW(parseType("decimal"), "Failed to parse type [decimal]");
+  VELOX_ASSERT_THROW(
+      parseType("decimal()"), "Failed to parse type [decimal()]");
+  VELOX_ASSERT_THROW(
+      parseType("decimal(20)"), "Failed to parse type [decimal(20)]");
+  VELOX_ASSERT_THROW(
+      parseType("decimal(, 20)"), "Failed to parse type [decimal(, 20)]");
+}
+
+} // namespace
+} // namespace facebook::velox