Skip to content
33 changes: 33 additions & 0 deletions onnxruntime/contrib_ops/contrib_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,37 @@ Sample echo operator.)DOC");
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)
.SetDoc(R"DOC(Returns which elements of the input are NaN.)DOC");

ONNX_CONTRIB_OPERATOR_SCHEMA(Tokenizer)

@pranavsharma pranavsharma Dec 5, 2018

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we mention UTF-8 in the spec just to be super clear? #Closed

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will do. The author of the spec claimed that all strings in ONNX are UTF-8. Well, they use it for Bing. So there you go.


In reply to: 238894817 [](ancestors = 238894817)

.SetDomain(kMSDomain)
.SinceVersion(1)
.Input(0, "X", "Strings to tokenize", "T")
.Output(0, "Y", "Tokenized strings", "T")
.TypeConstraint(
"T",
{"tensor(string)"},
"Input/Output is a string tensor")
.Attr(
"mark",
"Boolean whether to mark the beginning/end character with start of text character (0x02)/end of text character (0x03).",
AttributeProto::INT)
.Attr(
"pad_value",
"The string used to pad output tensors when the tokens extracted doesn't match the maximum number of tokens found.",
AttributeProto::STRING)
.Attr(
"separators",
"The list of separators, two consecutive segments in X connected by a separator would be divided into two tokens.",
AttributeProto::STRINGS)
.Attr(
"mincharnum",
"Minimum number of characters allowed in the output. For example, if mincharnum is 2, tokens such as 'A' and 'B' would be ignored",

@pranavsharma pranavsharma Dec 5, 2018

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like there are some non-ascii chars around A and B #Closed

AttributeProto::INT)
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
auto output_elem_type = ctx.getOutputType(0)->mutable_tensor_type();

@pranavsharma pranavsharma Dec 5, 2018

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using propagateElemTypeFromInputToOutput(ctx, 0, 0); for the type. #Closed

output_elem_type->set_elem_type(ONNX_NAMESPACE::TensorProto::STRING);
})
.SetDoc(R"DOC(Tokenizer divides each string in X into a vector of strings along the last axis.)DOC");

// Operators for linear 8-bit quantization support.
ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
.SetDomain(kMSDomain)
Expand Down Expand Up @@ -491,6 +522,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, IsNaN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear);
Expand All @@ -505,6 +537,7 @@ void RegisterContribKernels(std::function<void(KernelCreateInfo&&)> fn) {
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>());
fn(BuildKernel<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, IsNaN)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear)>());
Expand Down
Loading