Skip to content
32 changes: 32 additions & 0 deletions onnxruntime/contrib_ops/contrib_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,36 @@ Sample echo operator.)DOC");
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)
.SetDoc(R"DOC(Returns which elements of the input are NaN.)DOC");

ONNX_CONTRIB_OPERATOR_SCHEMA(Tokenizer)
Copy link
Contributor

@pranavsharma pranavsharma Dec 5, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we mention about UTF-8 in the spec just to be super clear? #Closed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will do. The author of the spec claimed that all strings in ONNX are UTF-8. Well, they use it for Bing. So there you go.


In reply to: 238894817 [](ancestors = 238894817)

.SetDomain(kMSDomain)
.SinceVersion(1)
.Input(0, "X", "Strings to tokenize", "T")
.Output(0, "Y", "Tokenized strings", "T")
.TypeConstraint(
"T",
{"tensor(string)"},
"Input/Output is a string tensor")
.Attr(
"mark",
"Boolean whether to mark the beginning/end character with start of text character (0x02)/end of text character (0x03).",
AttributeProto::INT)
.Attr(
"pad_value",
"The string used to pad output tensors when the tokens extracted doesn't match the maximum number of tokens found. If start/end markers are needed, padding will appear outside the markers.",
AttributeProto::STRING)
.Attr(
"separators",
"The list of separators, two consecutive segments in X connected by a separator would be divided into two tokens.",
AttributeProto::STRINGS)
.Attr(
"mincharnum",
"Minimum number of characters allowed in the output. For example, if mincharnum is 2, tokens such as \"A\" and \"B\" would be ignored",
AttributeProto::INT)
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
propagateElemTypeFromInputToOutput(ctx, 0, 0);
})
.SetDoc(R"DOC(Tokenizer divides each string in X into a vector of strings along the last axis. All input strings including attributes are UTF-8 encoded.)DOC");

// Operators for linear 8 bit quanitzation support.
ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
.SetDomain(kMSDomain)
Expand Down Expand Up @@ -491,6 +521,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, IsNaN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear);
Expand All @@ -505,6 +536,7 @@ void RegisterContribKernels(std::function<void(KernelCreateInfo&&)> fn) {
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>());
fn(BuildKernel<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, IsNaN)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DequantizeLinear)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>());
fn(BuildKernel<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QuantizeLinear)>());
Expand Down
Loading