diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java
index 9fda416b33bbe..d10fedfb58975 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java
@@ -22,6 +22,9 @@ import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
 import java.util.Objects;
 
 import static org.elasticsearch.action.ValidateActions.addValidationError;
@@ -109,8 +112,32 @@ public boolean equals(Object other) {
     public static class Request extends ActionRequest {
 
         public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
+        public static final ParseField CHARSET = FileStructure.CHARSET;
+        public static final ParseField FORMAT = FileStructure.FORMAT;
+        public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES;
+        public static final ParseField HAS_HEADER_ROW = FileStructure.HAS_HEADER_ROW;
+        public static final ParseField DELIMITER = FileStructure.DELIMITER;
+        public static final ParseField QUOTE = FileStructure.QUOTE;
+        public static final ParseField SHOULD_TRIM_FIELDS = FileStructure.SHOULD_TRIM_FIELDS;
+        public static final ParseField GROK_PATTERN = FileStructure.GROK_PATTERN;
+        // This one is plural in FileStructure, but singular in FileStructureOverrides
+        public static final ParseField TIMESTAMP_FORMAT = new ParseField("timestamp_format");
+        public static final ParseField TIMESTAMP_FIELD = FileStructure.TIMESTAMP_FIELD;
+
+        private static final String ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE =
+            "[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]";
 
         private Integer linesToSample;
+        private String charset;
+        private FileStructure.Format format;
+        private List<String> columnNames;
+        private Boolean hasHeaderRow;
+        private Character delimiter;
+        private Character quote;
+        private Boolean shouldTrimFields;
+        private String grokPattern;
+        private String timestampFormat;
+        private String timestampField;
         private BytesReference sample;
 
         public Request() {
@@ -124,6 +151,114 @@ public void setLinesToSample(Integer linesToSample) {
             this.linesToSample = linesToSample;
         }
 
+        public String getCharset() {
+            return charset;
+        }
+
+        public void setCharset(String charset) {
+            this.charset = (charset == null || charset.isEmpty()) ? null : charset;
+        }
+
+        public FileStructure.Format getFormat() {
+            return format;
+        }
+
+        public void setFormat(FileStructure.Format format) {
+            this.format = format;
+        }
+
+        public void setFormat(String format) {
+            this.format = (format == null || format.isEmpty()) ? null : FileStructure.Format.fromString(format);
+        }
+
+        public List<String> getColumnNames() {
+            return columnNames;
+        }
+
+        public void setColumnNames(List<String> columnNames) {
+            this.columnNames = (columnNames == null || columnNames.isEmpty()) ? null : columnNames;
+        }
+
+        public void setColumnNames(String[] columnNames) {
+            this.columnNames = (columnNames == null || columnNames.length == 0) ? null : Arrays.asList(columnNames);
+        }
+
+        public Boolean getHasHeaderRow() {
+            return hasHeaderRow;
+        }
+
+        public void setHasHeaderRow(Boolean hasHeaderRow) {
+            this.hasHeaderRow = hasHeaderRow;
+        }
+
+        public Character getDelimiter() {
+            return delimiter;
+        }
+
+        public void setDelimiter(Character delimiter) {
+            this.delimiter = delimiter;
+        }
+
+        public void setDelimiter(String delimiter) {
+            if (delimiter == null || delimiter.isEmpty()) {
+                this.delimiter = null;
+            } else if (delimiter.length() == 1) {
+                this.delimiter = delimiter.charAt(0);
+            } else {
+                throw new IllegalArgumentException(DELIMITER.getPreferredName() + " must be a single character");
+            }
+        }
+
+        public Character getQuote() {
+            return quote;
+        }
+
+        public void setQuote(Character quote) {
+            this.quote = quote;
+        }
+
+        public void setQuote(String quote) {
+            if (quote == null || quote.isEmpty()) {
+                this.quote = null;
+            } else if (quote.length() == 1) {
+                this.quote = quote.charAt(0);
+            } else {
+                throw new IllegalArgumentException(QUOTE.getPreferredName() + " must be a single character");
+            }
+        }
+
+        public Boolean getShouldTrimFields() {
+            return shouldTrimFields;
+        }
+
+        public void setShouldTrimFields(Boolean shouldTrimFields) {
+            this.shouldTrimFields = shouldTrimFields;
+        }
+
+        public String getGrokPattern() {
+            return grokPattern;
+        }
+
+        public void setGrokPattern(String grokPattern) {
+            this.grokPattern = (grokPattern == null || grokPattern.isEmpty()) ? null : grokPattern;
+        }
+
+        public String getTimestampFormat() {
+            return timestampFormat;
+        }
+
+        public void setTimestampFormat(String timestampFormat) {
+            this.timestampFormat = (timestampFormat == null || timestampFormat.isEmpty()) ? null : timestampFormat;
+        }
+
+        public String getTimestampField() {
+            return timestampField;
+        }
+
+        public void setTimestampField(String timestampField) {
+            this.timestampField = (timestampField == null || timestampField.isEmpty()) ? null : timestampField;
+        }
+
         public BytesReference getSample() {
             return sample;
         }
@@ -132,12 +267,41 @@ public void setSample(BytesReference sample) {
             this.sample = sample;
         }
 
+        private static ActionRequestValidationException addIncompatibleArgError(ParseField arg, FileStructure.Format format,
+                                                                                 ActionRequestValidationException validationException) {
+            return addValidationError(String.format(Locale.ROOT, ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE, arg.getPreferredName(), format),
+                validationException);
+        }
+
         @Override
         public ActionRequestValidationException validate() {
             ActionRequestValidationException validationException = null;
             if (linesToSample != null && linesToSample <= 0) {
                 validationException =
-                    addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException);
+                    addValidationError("[" + LINES_TO_SAMPLE.getPreferredName() + "] must be positive if specified", validationException);
+            }
+            if (format != FileStructure.Format.DELIMITED) {
+                if (columnNames != null) {
+                    validationException = addIncompatibleArgError(COLUMN_NAMES, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (hasHeaderRow != null) {
+                    validationException = addIncompatibleArgError(HAS_HEADER_ROW, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (delimiter != null) {
+                    validationException = addIncompatibleArgError(DELIMITER, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (quote != null) {
+                    validationException = addIncompatibleArgError(QUOTE, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (shouldTrimFields != null) {
+                    validationException = addIncompatibleArgError(SHOULD_TRIM_FIELDS, FileStructure.Format.DELIMITED, validationException);
+                }
+            }
+            if (format != FileStructure.Format.SEMI_STRUCTURED_TEXT) {
+                if (grokPattern != null) {
+                    validationException =
+                        addIncompatibleArgError(GROK_PATTERN, FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException);
+                }
             }
             if (sample == null || sample.length() == 0) {
                 validationException = addValidationError("sample must be specified", validationException);
@@ -149,6 +313,16 @@ public ActionRequestValidationException validate() {
         public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             linesToSample = in.readOptionalVInt();
+            charset = in.readOptionalString();
+            format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null;
+            columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null;
+            hasHeaderRow = in.readOptionalBoolean();
+            delimiter = in.readBoolean() ? (char) in.readVInt() : null;
+            quote = in.readBoolean() ? (char) in.readVInt() : null;
+            shouldTrimFields = in.readOptionalBoolean();
+            grokPattern = in.readOptionalString();
+            timestampFormat = in.readOptionalString();
+            timestampField = in.readOptionalString();
             sample = in.readBytesReference();
         }
 
@@ -156,12 +330,43 @@ public void readFrom(StreamInput in) throws IOException {
         public void writeTo(StreamOutput out) throws IOException {
             super.writeTo(out);
             out.writeOptionalVInt(linesToSample);
+            out.writeOptionalString(charset);
+            if (format == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeEnum(format);
+            }
+            if (columnNames == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeCollection(columnNames, StreamOutput::writeString);
+            }
+            out.writeOptionalBoolean(hasHeaderRow);
+            if (delimiter == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeVInt(delimiter);
+            }
+            if (quote == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeVInt(quote);
+            }
+            out.writeOptionalBoolean(shouldTrimFields);
+            out.writeOptionalString(grokPattern);
+            out.writeOptionalString(timestampFormat);
+            out.writeOptionalString(timestampField);
             out.writeBytesReference(sample);
         }
 
         @Override
         public int hashCode() {
-            return Objects.hash(linesToSample, sample);
+            return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields,
+                grokPattern, timestampFormat, timestampField, sample);
         }
 
         @Override
@@ -177,6 +382,16 @@ public boolean equals(Object other) {
 
             Request that = (Request) other;
             return Objects.equals(this.linesToSample, that.linesToSample) &&
+                Objects.equals(this.charset, that.charset) &&
+                Objects.equals(this.format, that.format) &&
+                Objects.equals(this.columnNames, that.columnNames) &&
+                Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
+                Objects.equals(this.delimiter, that.delimiter) &&
+                Objects.equals(this.quote, that.quote) &&
+                Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
+                Objects.equals(this.grokPattern, that.grokPattern) &&
+                Objects.equals(this.timestampFormat, that.timestampFormat) &&
+                Objects.equals(this.timestampField, that.timestampField) &&
                 Objects.equals(this.sample, that.sample);
         }
     }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java
index dd508dfb36b74..db5f29f3b1b63 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java
@@ -84,25 +84,26 @@ public String toString() {
 
     public static final String EXPLAIN = "explain";
 
-    static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
-    static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
-    static final ParseField SAMPLE_START = new ParseField("sample_start");
-    static final ParseField CHARSET = new ParseField("charset");
-    static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
-    static final ParseField STRUCTURE = new ParseField("format");
-    static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
-    static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
-    static final ParseField COLUMN_NAMES = new ParseField("column_names");
-    static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
-    static final ParseField DELIMITER = new
ParseField("delimiter"); - static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); - static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); - static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field"); - static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats"); - static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone"); - static final ParseField MAPPINGS = new ParseField("mappings"); - static final ParseField FIELD_STATS = new ParseField("field_stats"); - static final ParseField EXPLANATION = new ParseField("explanation"); + public static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed"); + public static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed"); + public static final ParseField SAMPLE_START = new ParseField("sample_start"); + public static final ParseField CHARSET = new ParseField("charset"); + public static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker"); + public static final ParseField FORMAT = new ParseField("format"); + public static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern"); + public static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern"); + public static final ParseField COLUMN_NAMES = new ParseField("column_names"); + public static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row"); + public static final ParseField DELIMITER = new ParseField("delimiter"); + public static final ParseField QUOTE = new ParseField("quote"); + public static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); + public static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); + public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field"); + public static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats"); + public static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone"); + public static final ParseField MAPPINGS = new ParseField("mappings"); + public static final ParseField FIELD_STATS = new ParseField("field_stats"); + public static final ParseField EXPLANATION = new ParseField("explanation"); public static final ObjectParser PARSER = new ObjectParser<>("file_structure", false, Builder::new); @@ -112,12 +113,13 @@ public String toString() { PARSER.declareString(Builder::setSampleStart, SAMPLE_START); PARSER.declareString(Builder::setCharset, CHARSET); PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER); - PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE); + PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), FORMAT); PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN); PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN); PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES); PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW); PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER); + PARSER.declareString((p, c) -> p.setQuote(c.charAt(0)), QUOTE); PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS); PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN); PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD); @@ -145,6 +147,7 @@ public String toString() { private final List columnNames; private final Boolean 
hasHeaderRow; private final Character delimiter; + private final Character quote; private final Boolean shouldTrimFields; private final String grokPattern; private final List timestampFormats; @@ -156,8 +159,8 @@ public String toString() { public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker, Format format, String multilineStartPattern, String excludeLinesPattern, List columnNames, - Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField, - List timestampFormats, boolean needClientTimezone, Map mappings, + Boolean hasHeaderRow, Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, + String timestampField, List timestampFormats, boolean needClientTimezone, Map mappings, Map fieldStats, List explanation) { this.numLinesAnalyzed = numLinesAnalyzed; @@ -171,6 +174,7 @@ public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampl this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames)); this.hasHeaderRow = hasHeaderRow; this.delimiter = delimiter; + this.quote = quote; this.shouldTrimFields = shouldTrimFields; this.grokPattern = grokPattern; this.timestampField = timestampField; @@ -193,6 +197,7 @@ public FileStructure(StreamInput in) throws IOException { columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; hasHeaderRow = in.readOptionalBoolean(); delimiter = in.readBoolean() ? (char) in.readVInt() : null; + quote = in.readBoolean() ? (char) in.readVInt() : null; shouldTrimFields = in.readOptionalBoolean(); grokPattern = in.readOptionalString(); timestampFormats = in.readBoolean() ? 
Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; @@ -226,6 +231,12 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(true); out.writeVInt(delimiter); } + if (quote == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeVInt(quote); + } out.writeOptionalBoolean(shouldTrimFields); out.writeOptionalString(grokPattern); if (timestampFormats == null) { @@ -285,6 +296,10 @@ public Character getDelimiter() { return delimiter; } + public Character getQuote() { + return quote; + } + public Boolean getShouldTrimFields() { return shouldTrimFields; } @@ -328,7 +343,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (hasByteOrderMarker != null) { builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue()); } - builder.field(STRUCTURE.getPreferredName(), format); + builder.field(FORMAT.getPreferredName(), format); if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) { builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern); } @@ -344,6 +359,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (delimiter != null) { builder.field(DELIMITER.getPreferredName(), String.valueOf(delimiter)); } + if (quote != null) { + builder.field(QUOTE.getPreferredName(), String.valueOf(quote)); + } if (shouldTrimFields != null) { builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue()); } @@ -377,8 +395,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws public int hashCode() { return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField, - timestampFormats, needClientTimezone, mappings, fieldStats, explanation); + multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, + timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } @Override @@ -405,6 +423,7 @@ public boolean equals(Object other) { Objects.equals(this.columnNames, that.columnNames) && Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && Objects.equals(this.delimiter, that.delimiter) && + Objects.equals(this.quote, that.quote) && Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && Objects.equals(this.grokPattern, that.grokPattern) && Objects.equals(this.timestampField, that.timestampField) && @@ -427,6 +446,7 @@ public static class Builder { private List columnNames; private Boolean hasHeaderRow; private Character delimiter; + private Character quote; private Boolean shouldTrimFields; private String grokPattern; private String timestampField; @@ -499,6 +519,11 @@ public Builder setDelimiter(Character delimiter) { return this; } + public Builder setQuote(Character quote) { + this.quote = quote; + return this; + } + public Builder setShouldTrimFields(Boolean shouldTrimFields) { this.shouldTrimFields = shouldTrimFields; return this; @@ -582,6 +607,9 @@ public FileStructure build() { if (delimiter != null) { throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures."); } + if (quote != null) { + throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures."); + } if (grokPattern != null) { throw new 
IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures."); } @@ -610,6 +638,9 @@ public FileStructure build() { if (delimiter != null) { throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures."); } + if (quote != null) { + throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures."); + } if (shouldTrimFields != null) { throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures."); } @@ -638,7 +669,7 @@ public FileStructure build() { } return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, + multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java index 05ba0e7f306f4..21f11fa5f73c7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java @@ -8,6 +8,9 @@ import org.elasticsearch.action.ActionRequestValidationException; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.test.AbstractStreamableTestCase; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + +import java.util.Arrays; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.startsWith; @@ -22,6 +25,44 @@ protected FindFileStructureAction.Request createTestInstance() { if (randomBoolean()) { request.setLinesToSample(randomIntBetween(10, 2000)); } + + if (randomBoolean()) { + request.setCharset(randomAlphaOfLength(10)); + } + + if (randomBoolean()) { + FileStructure.Format format = randomFrom(FileStructure.Format.values()); + request.setFormat(format); + if (format == FileStructure.Format.DELIMITED) { + if (randomBoolean()) { + request.setColumnNames(generateRandomStringArray(10, 15, false, false)); + } + if (randomBoolean()) { + request.setHasHeaderRow(randomBoolean()); + } + if (randomBoolean()) { + request.setDelimiter(randomFrom(',', '\t', ';', '|')); + } + if (randomBoolean()) { + request.setQuote(randomFrom('"', '\'')); + } + if (randomBoolean()) { + request.setShouldTrimFields(randomBoolean()); + } + } else if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) { + if (randomBoolean()) { + request.setGrokPattern(randomAlphaOfLength(80)); + } + } + } + + if (randomBoolean()) { + request.setTimestampFormat(randomAlphaOfLength(20)); + } + if (randomBoolean()) { + request.setTimestampField(randomAlphaOfLength(15)); + } + request.setSample(new BytesArray(randomByteArrayOfLength(randomIntBetween(1000, 20000)))); return request; @@ -35,13 +76,62 @@ protected FindFileStructureAction.Request createBlankInstance() { public void testValidateLinesToSample() { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); - request.setLinesToSample(randomFrom(-1, 0)); + request.setLinesToSample(randomIntBetween(-1, 0)); + 
request.setSample(new BytesArray("foo\n")); + + ActionRequestValidationException e = request.validate(); + assertNotNull(e); + assertThat(e.getMessage(), startsWith("Validation Failed: ")); + assertThat(e.getMessage(), containsString(" [lines_to_sample] must be positive if specified")); + } + + public void testValidateNonDelimited() { + + FindFileStructureAction.Request request = new FindFileStructureAction.Request(); + String errorField; + switch (randomIntBetween(0, 4)) { + case 0: + errorField = "column_names"; + request.setColumnNames(Arrays.asList("col1", "col2")); + break; + case 1: + errorField = "has_header_row"; + request.setHasHeaderRow(randomBoolean()); + break; + case 2: + errorField = "delimiter"; + request.setDelimiter(randomFrom(',', '\t', ';', '|')); + break; + case 3: + errorField = "quote"; + request.setQuote(randomFrom('"', '\'')); + break; + case 4: + errorField = "should_trim_fields"; + request.setShouldTrimFields(randomBoolean()); + break; + default: + throw new IllegalStateException("unexpected switch value"); + } + request.setSample(new BytesArray("foo\n")); + + ActionRequestValidationException e = request.validate(); + assertNotNull(e); + assertThat(e.getMessage(), startsWith("Validation Failed: ")); + assertThat(e.getMessage(), containsString(" [" + errorField + "] may only be specified if [format] is [delimited]")); + } + + public void testValidateNonSemiStructuredText() { + + FindFileStructureAction.Request request = new FindFileStructureAction.Request(); + request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED)); + request.setGrokPattern(randomAlphaOfLength(80)); request.setSample(new BytesArray("foo\n")); ActionRequestValidationException e = request.validate(); assertNotNull(e); assertThat(e.getMessage(), startsWith("Validation Failed: ")); - assertThat(e.getMessage(), containsString(" lines_to_sample must be positive if specified")); + assertThat(e.getMessage(), containsString(" [grok_pattern] may only be specified if [format] is [semi_structured_text]")); } public void testValidateSample() { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java index e09b9e3f91e7a..ac6c647136bd7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java @@ -54,6 +54,7 @@ public static FileStructure createTestFileStructure() { builder.setColumnNames(Arrays.asList(generateRandomStringArray(10, 10, false, false))); builder.setHasHeaderRow(randomBoolean()); builder.setDelimiter(randomFrom(',', '\t', ';', '|')); + builder.setQuote(randomFrom('"', '\'')); } if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java index 66d07f5111c52..ec37a2b7481f6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java @@ -17,6 +17,7 @@ import org.elasticsearch.xpack.ml.MachineLearning; import 
org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder; import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager; +import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides; public class TransportFindFileStructureAction extends HandledTransportAction { @@ -49,8 +50,8 @@ private FindFileStructureAction.Response buildFileStructureResponse(FindFileStru FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(); - FileStructureFinder fileStructureFinder = - structureFinderManager.findFileStructure(request.getLinesToSample(), request.getSample().streamInput()); + FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(), + request.getSample().streamInput(), new FileStructureOverrides(request)); return new FindFileStructureAction.Response(fileStructureFinder.getStructure()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index ba6b590dfc8cd..a103560480d06 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -33,6 +33,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { + private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])"; private static final int MAX_LEVENSHTEIN_COMPARISONS = 100; private final List sampleMessages; @@ -40,21 +41,35 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, CsvPreference csvPreference, - boolean trimFields) throws IOException { + boolean trimFields, FileStructureOverrides overrides) + throws IOException { Tuple>, List> parsed = readRows(sample, csvPreference); List> rows = parsed.v1(); List lineNumbers = parsed.v2(); - Tuple headerInfo = findHeaderFromSample(explanation, rows); + // Even if the column names are overridden we need to know if there's a + // header in the file, as it affects which rows are considered records + Tuple headerInfo = findHeaderFromSample(explanation, rows, overrides); boolean isHeaderInFile = headerInfo.v1(); String[] header = headerInfo.v2(); - // The column names are the header names but with blanks named column1, column2, etc. - String[] columnNames = new String[header.length]; - for (int i = 0; i < header.length; ++i) { - assert header[i] != null; - String rawHeader = trimFields ? header[i].trim() : header[i]; - columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader; + + String[] columnNames; + List overriddenColumnNames = overrides.getColumnNames(); + if (overriddenColumnNames != null) { + if (overriddenColumnNames.size() != header.length) { + throw new IllegalArgumentException("[" + overriddenColumnNames.size() + "] column names were specified [" + + String.join(",", overriddenColumnNames) + "] but there are [" + header.length + "] columns in the sample"); + } + columnNames = overriddenColumnNames.toArray(new String[overriddenColumnNames.size()]); + } else { + // The column names are the header names but with blanks named column1, column2, etc. 
+ columnNames = new String[header.length]; + for (int i = 0; i < header.length; ++i) { + assert header[i] != null; + String rawHeader = trimFields ? header[i].trim() : header[i]; + columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader; + } } List sampleLines = Arrays.asList(sample.split("\n")); @@ -84,13 +99,14 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { String timeLineRegex = null; StringBuilder builder = new StringBuilder("^"); @@ -98,7 +114,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?") + .map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote) .collect(Collectors.joining(","))); } @@ -131,7 +150,10 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List mappings = mappingsAndFieldStats.v1(); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); @@ -205,45 +227,61 @@ static Tuple>, List> readRows(String sample, CsvPrefe return new Tuple<>(rows, lineNumbers); } - static Tuple findHeaderFromSample(List explanation, List> rows) { + static Tuple findHeaderFromSample(List explanation, List> rows, + FileStructureOverrides overrides) { assert rows.isEmpty() == false; + List overriddenColumnNames = overrides.getColumnNames(); List firstRow = rows.get(0); boolean isHeaderInFile = true; - if (rowContainsDuplicateNonEmptyValues(firstRow)) { - isHeaderInFile = false; - explanation.add("First row contains duplicate values, so assuming it's not a header"); + if (overrides.getHasHeaderRow() != null) { + isHeaderInFile = overrides.getHasHeaderRow(); + if (isHeaderInFile && overriddenColumnNames == null) { + String duplicateValue = findDuplicateNonEmptyValues(firstRow); + if (duplicateValue != null) { + throw new IllegalArgumentException("Sample specified to contain a header row, " + + "but the first row contains duplicate values: [" + duplicateValue + "]"); + } + } + explanation.add("Sample specified to " + (isHeaderInFile ? "contain" : "not contain") + " a header row"); } else { - if (rows.size() < 3) { - explanation.add("Too little data to accurately assess whether header is in sample - guessing it is"); + if (findDuplicateNonEmptyValues(firstRow) != null) { + isHeaderInFile = false; + explanation.add("First row contains duplicate values, so assuming it's not a header"); } else { - isHeaderInFile = isFirstRowUnusual(explanation, rows); + if (rows.size() < 3) { + explanation.add("Too little data to accurately assess whether header is in sample - guessing it is"); + } else { + isHeaderInFile = isFirstRowUnusual(explanation, rows); + } } } + String[] header; if (isHeaderInFile) { // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us - return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? 
"" : field).toArray(String[]::new)); + header = firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new); } else { - String[] dummyHeader = new String[firstRow.size()]; - Arrays.fill(dummyHeader, ""); - return new Tuple<>(false, dummyHeader); + header = new String[firstRow.size()]; + Arrays.fill(header, ""); } + + return new Tuple<>(isHeaderInFile, header); } - static boolean rowContainsDuplicateNonEmptyValues(List row) { + static String findDuplicateNonEmptyValues(List row) { HashSet values = new HashSet<>(); for (String value : row) { if (value != null && value.isEmpty() == false && values.add(value) == false) { - return true; + return value; } } - return false; + return null; } private static boolean isFirstRowUnusual(List explanation, List> rows) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java index 0bbe13e3b05c3..62e5eff517e90 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import org.supercsv.prefs.CsvPreference; import java.io.IOException; @@ -17,12 +18,23 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF private final int minFieldsPerRow; private final boolean trimFields; - DelimitedFileStructureFinderFactory(char delimiter, int minFieldsPerRow, boolean trimFields) { - csvPreference = new CsvPreference.Builder('"', delimiter, "\n").build(); + DelimitedFileStructureFinderFactory(char delimiter, char quote, int minFieldsPerRow, boolean trimFields) { + csvPreference = new CsvPreference.Builder(quote, delimiter, "\n").build(); this.minFieldsPerRow = minFieldsPerRow; this.trimFields = trimFields; } + DelimitedFileStructureFinderFactory makeSimilar(Character quote, Boolean trimFields) { + + return new DelimitedFileStructureFinderFactory((char) csvPreference.getDelimiterChar(), + (quote == null) ? csvPreference.getQuoteChar() : quote, minFieldsPerRow, (trimFields == null) ? 
this.trimFields : trimFields); + } + + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.DELIMITED; + } + /** * Rules are: * - It must contain at least two complete records @@ -49,9 +61,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws IOException { + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException { return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - csvPreference, trimFields); + csvPreference, trimFields, overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index 4b6fce322ee1d..bff4b2115b0fd 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -5,10 +5,20 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + import java.util.List; public interface FileStructureFinderFactory { + /** + * Can this factory create a {@link FileStructureFinder} that can find the supplied format? + * @param format The format to query, or null. + * @return true if {@code format} is null or the factory + * can produce a {@link FileStructureFinder} that can find {@code format}. + */ + boolean canFindFormat(FileStructure.Format format); + /** * Given a sample of a file, decide whether this factory will be able * to create an appropriate object to represent its ingestion configs. @@ -27,9 +37,11 @@ public interface FileStructureFinderFactory { * @param sample A sample from the file to be ingested. * @param charsetName The name of the character set in which the sample was provided. * @param hasByteOrderMarker Did the sample have a byte order marker? null means "not relevant". - * @return A file structure object suitable for ingesting the supplied sample. + * @param overrides Stores structure decisions that have been made by the end user, and should + * take precedence over anything the {@link FileStructureFinder} may decide. + * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample. * @throws Exception if something goes wrong during creation. 
*/ - FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws Exception; + FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index d0ce68aff25c0..7949998d16e01 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -13,6 +13,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -24,6 +25,7 @@ import java.util.Locale; import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; /** * Runs the high-level steps needed to create ingest configs for the specified file. In order: @@ -70,15 +72,19 @@ public final class FileStructureFinderManager { new JsonFileStructureFinderFactory(), new XmlFileStructureFinderFactory(), // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV - new DelimitedFileStructureFinderFactory(',', 2, false), - new DelimitedFileStructureFinderFactory('\t', 2, false), - new DelimitedFileStructureFinderFactory(';', 4, false), - new DelimitedFileStructureFinderFactory('|', 5, true), + new DelimitedFileStructureFinderFactory(',', '"', 2, false), + new DelimitedFileStructureFinderFactory('\t', '"', 2, false), + new DelimitedFileStructureFinderFactory(';', '"', 4, false), + new DelimitedFileStructureFinderFactory('|', '"', 5, true), new TextLogFileStructureFinderFactory() )); private static final int BUFFER_SIZE = 8192; + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { + return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + } + /** * Given a stream of data from some file, determine its structure. * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure? @@ -86,24 +92,42 @@ public final class FileStructureFinderManager { * least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If null * the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used. * @param fromFile A stream from which the sample will be read. + * @param overrides Aspects of the file structure that are known in advance. These take precedence over + * values determined by structure analysis. An exception will be thrown if the file structure + * is incompatible with an overridden value. * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. 
*/ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides) + throws Exception { return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - fromFile); + fromFile, overrides); } public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile) throws Exception { + return findFileStructure(new ArrayList<>(), idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile, + FileStructureOverrides overrides) throws Exception { - CharsetMatch charsetMatch = findCharset(explanation, fromFile); - String charsetName = charsetMatch.getName(); + String charsetName = overrides.getCharset(); + Reader sampleReader; + if (charsetName != null) { + // Creating the reader will throw if the specified character set does not exist + sampleReader = new InputStreamReader(fromFile, charsetName); + explanation.add("Using specified character encoding [" + charsetName + "]"); + } else { + CharsetMatch charsetMatch = findCharset(explanation, fromFile); + charsetName = charsetMatch.getName(); + sampleReader = charsetMatch.getReader(); + } - Tuple sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT, + Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount)); - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2()); + return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides); } CharsetMatch findCharset(List explanation, InputStream inputStream) throws Exception { @@ -195,15 +219,44 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro (containsZeroBytes ? " - could it be binary data?" : "")); } - FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws Exception { + FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws Exception { - for (FileStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) { + Character delimiter = overrides.getDelimiter(); + Character quote = overrides.getQuote(); + Boolean shouldTrimFields = overrides.getShouldTrimFields(); + List factories; + if (delimiter != null) { + + // If a precise delimiter is specified, we only need one structure finder + // factory, and we'll tolerate as little as one column in the input + factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1, + (shouldTrimFields == null) ? 
(delimiter == '|') : shouldTrimFields)); + + } else if (quote != null || shouldTrimFields != null) { + + // The delimiter is not specified, but some other aspect of delimited files is, + // so clone our default delimited factories altering the overridden values + factories = ORDERED_STRUCTURE_FACTORIES.stream().filter(factory -> factory instanceof DelimitedFileStructureFinderFactory) + .map(factory -> ((DelimitedFileStructureFinderFactory) factory).makeSimilar(quote, shouldTrimFields)) + .collect(Collectors.toList()); + + } else { + + // We can use the default factories, but possibly filtered down to a specific format + factories = ORDERED_STRUCTURE_FACTORIES.stream() + .filter(factory -> factory.canFindFormat(overrides.getFormat())).collect(Collectors.toList()); + + } + + for (FileStructureFinderFactory factory : factories) { if (factory.canCreateFromSample(explanation, sample)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker); + return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides); } } - throw new IllegalArgumentException("Input did not match any known formats"); + + throw new IllegalArgumentException("Input did not match " + + ((overrides.getFormat() == null) ? "any known formats" : "the specified format [" + overrides.getFormat() + "]")); } private Tuple sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException { @@ -233,7 +286,7 @@ private Tuple sampleFile(Reader reader, String charsetName, int } if (lineCount < minLines) { - throw new IllegalArgumentException("Input contained too few lines to sample"); + throw new IllegalArgumentException("Input contained too few lines [" + lineCount + "] to obtain a meaningful sample"); } return new Tuple<>(sample.toString(), hasByteOrderMarker); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java new file mode 100644 index 0000000000000..e30699c69b7f8 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java @@ -0,0 +1,205 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.ml.filestructurefinder; + +import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** + * An immutable holder for the aspects of file structure detection that can be overridden + * by the end user. Every field can be null, and this means that that + * aspect of the file structure detection is not overridden. + * + * There is no consistency checking in this class. Consistency checking of the different + * fields is done in {@link FindFileStructureAction.Request}. 
+ */ +public class FileStructureOverrides { + + public static final FileStructureOverrides EMPTY_OVERRIDES = new Builder().build(); + + private final String charset; + private final FileStructure.Format format; + private final List columnNames; + private final Boolean hasHeaderRow; + private final Character delimiter; + private final Character quote; + private final Boolean shouldTrimFields; + private final String grokPattern; + private final String timestampFormat; + private final String timestampField; + + public FileStructureOverrides(FindFileStructureAction.Request request) { + + this(request.getCharset(), request.getFormat(), request.getColumnNames(), request.getHasHeaderRow(), request.getDelimiter(), + request.getQuote(), request.getShouldTrimFields(), request.getGrokPattern(), request.getTimestampFormat(), + request.getTimestampField()); + } + + private FileStructureOverrides(String charset, FileStructure.Format format, List columnNames, Boolean hasHeaderRow, + Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, + String timestampFormat, String timestampField) { + this.charset = charset; + this.format = format; + this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames)); + this.hasHeaderRow = hasHeaderRow; + this.delimiter = delimiter; + this.quote = quote; + this.shouldTrimFields = shouldTrimFields; + this.grokPattern = grokPattern; + this.timestampFormat = timestampFormat; + this.timestampField = timestampField; + } + + public static Builder builder() { + return new Builder(); + } + + public String getCharset() { + return charset; + } + + public FileStructure.Format getFormat() { + return format; + } + + public List getColumnNames() { + return columnNames; + } + + public Boolean getHasHeaderRow() { + return hasHeaderRow; + } + + public Character getDelimiter() { + return delimiter; + } + + public Character getQuote() { + return quote; + } + + public Boolean getShouldTrimFields() { + return shouldTrimFields; + } + + public String getGrokPattern() { + return grokPattern; + } + + public String getTimestampFormat() { + return timestampFormat; + } + + public String getTimestampField() { + return timestampField; + } + + @Override + public int hashCode() { + + return Objects.hash(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampFormat, + timestampField); + } + + @Override + public boolean equals(Object other) { + + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + FileStructureOverrides that = (FileStructureOverrides) other; + return Objects.equals(this.charset, that.charset) && + Objects.equals(this.format, that.format) && + Objects.equals(this.columnNames, that.columnNames) && + Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && + Objects.equals(this.delimiter, that.delimiter) && + Objects.equals(this.quote, that.quote) && + Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && + Objects.equals(this.grokPattern, that.grokPattern) && + Objects.equals(this.timestampFormat, that.timestampFormat) && + Objects.equals(this.timestampField, that.timestampField); + } + + public static class Builder { + + private String charset; + private FileStructure.Format format; + private List columnNames; + private Boolean hasHeaderRow; + private Character delimiter; + private Character quote; + private Boolean shouldTrimFields; + private String grokPattern; + private String 
timestampFormat; + private String timestampField; + + public Builder setCharset(String charset) { + this.charset = charset; + return this; + } + + public Builder setFormat(FileStructure.Format format) { + this.format = format; + return this; + } + + public Builder setColumnNames(List columnNames) { + this.columnNames = columnNames; + return this; + } + + public Builder setHasHeaderRow(Boolean hasHeaderRow) { + this.hasHeaderRow = hasHeaderRow; + return this; + } + + public Builder setDelimiter(Character delimiter) { + this.delimiter = delimiter; + return this; + } + + public Builder setQuote(Character quote) { + this.quote = quote; + return this; + } + + public Builder setShouldTrimFields(Boolean shouldTrimFields) { + this.shouldTrimFields = shouldTrimFields; + return this; + } + + public Builder setGrokPattern(String grokPattern) { + this.grokPattern = grokPattern; + return this; + } + + public Builder setTimestampFormat(String timestampFormat) { + this.timestampFormat = timestampFormat; + return this; + } + + public Builder setTimestampField(String timestampField) { + this.timestampField = timestampField; + return this; + } + + public FileStructureOverrides build() { + + return new FileStructureOverrides(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, + timestampFormat, timestampField); + } + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 0341e03a20bc6..66ecee5b311bb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -51,29 +51,41 @@ private FileStructureUtils() { * may be non-empty when the method is called, and this method may * append to it. * @param sampleRecords List of records derived from the provided sample. + * @param overrides Aspects of the file structure that are known in advance. These take precedence over + * values determined by structure analysis. An exception will be thrown if the file structure + * is incompatible with an overridden value. * @return A tuple of (field name, timestamp format) if one can be found, or null if * there is no consistent timestamp. 
*/ - static Tuple guessTimestampField(List explanation, List> sampleRecords) { + static Tuple guessTimestampField(List explanation, List> sampleRecords, + FileStructureOverrides overrides) { if (sampleRecords.isEmpty()) { return null; } // Accept the first match from the first sample that is compatible with all the other samples - for (Tuple candidate : findCandidates(explanation, sampleRecords)) { + for (Tuple candidate : findCandidates(explanation, sampleRecords, overrides)) { boolean allGood = true; for (Map sampleRecord : sampleRecords.subList(1, sampleRecords.size())) { Object fieldValue = sampleRecord.get(candidate.v1()); if (fieldValue == null) { + if (overrides.getTimestampField() != null) { + throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + + "] is not present in record [" + sampleRecord + "]"); + } explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord + "] doesn't have field"); allGood = false; break; } - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString()); + TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString(), overrides.getTimestampFormat()); if (match == null || match.candidateIndex != candidate.v2().candidateIndex) { + if (overrides.getTimestampFormat() != null) { + throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() + + "] does not match for record [" + sampleRecord + "]"); + } explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord + "] matches differently: [" + match + "]"); allGood = false; @@ -82,7 +94,8 @@ static Tuple guessTimestampField(List explanatio } if (allGood) { - explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]"); + explanation.add(((overrides.getTimestampField() == null) ? 
"Guessing timestamp" : "Timestamp") + + " field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]"); return candidate; } } @@ -90,23 +103,41 @@ static Tuple guessTimestampField(List explanatio return null; } - private static List> findCandidates(List explanation, List> sampleRecords) { + private static List> findCandidates(List explanation, List> sampleRecords, + FileStructureOverrides overrides) { + + assert sampleRecords.isEmpty() == false; + Map firstRecord = sampleRecords.get(0); + + String onlyConsiderField = overrides.getTimestampField(); + if (onlyConsiderField != null && firstRecord.get(onlyConsiderField) == null) { + throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + + "] is not present in record [" + firstRecord + "]"); + } List> candidates = new ArrayList<>(); - // Get candidate timestamps from the first sample record - for (Map.Entry entry : sampleRecords.get(0).entrySet()) { - Object value = entry.getValue(); - if (value != null) { - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString()); - if (match != null) { - Tuple candidate = new Tuple<>(entry.getKey(), match); - candidates.add(candidate); - explanation.add("First sample timestamp match [" + candidate + "]"); + // Get candidate timestamps from the possible field(s) of the first sample record + for (Map.Entry field : firstRecord.entrySet()) { + String fieldName = field.getKey(); + if (onlyConsiderField == null || onlyConsiderField.equals(fieldName)) { + Object value = field.getValue(); + if (value != null) { + TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString(), overrides.getTimestampFormat()); + if (match != null) { + Tuple candidate = new Tuple<>(fieldName, match); + candidates.add(candidate); + explanation.add("First sample timestamp match [" + candidate + "]"); + } } } } + if (candidates.isEmpty() && overrides.getTimestampFormat() != null) { + throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() + + "] does not match for record [" + firstRecord + "]"); + } + return candidates; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 292d0b8e8b305..54be5079c9d2c 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -48,21 +48,21 @@ public final class GrokPatternCreator { * Grok patterns that are designed to match the whole message, not just a part of it. 
*/ private static final List FULL_MATCH_GROK_PATTERNS = Arrays.asList( - new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"), - new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"), - new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"), - new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"), - new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"), - new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"), - new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"), - new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"), - new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"), - new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"), - new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"), - new FullMatchGrokPatternCandidate("RAILS3", "timestamp"), - new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"), - new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"), - new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp") + FullMatchGrokPatternCandidate.fromGrokPatternName("BACULA_LOGLINE", "bts"), + FullMatchGrokPatternCandidate.fromGrokPatternName("CATALINALOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("COMBINEDAPACHELOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("COMMONAPACHELOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("ELB_ACCESS_LOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYHTTP", "syslog_timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYTCP", "syslog_timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD20_ERRORLOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD24_ERRORLOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("NAGIOSLOGLINE", "nagios_epoch"), + FullMatchGrokPatternCandidate.fromGrokPatternName("NETSCREENSESSIONLOG", "date"), + FullMatchGrokPatternCandidate.fromGrokPatternName("RAILS3", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("RUBY_LOGGER", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("SHOREWALL", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("TOMCATLOG", "timestamp") ); /** @@ -87,7 +87,7 @@ public final class GrokPatternCreator { // Can't use \b as the breaks, because slashes are not "word" characters new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(? explanation, Collection sampleMes /** * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety. * It will also update mappings and field stats if they are non-null. + * @param timestampField If not null then the chosen Grok pattern must use this timestamp field. * @return A tuple of (time field name, Grok string), or null if no suitable Grok pattern was found. 
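[Editor's note] A short sketch (sample inputs assumed, not part of the patch) of how the new timestampField argument narrows the full-line candidates before any matching is attempted:

    GrokPatternCreator creator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats);
    Tuple<String, String> fullMatch = creator.findFullLineGrokPattern("timestamp");
    // Candidates whose time field differs, e.g. HAPROXYHTTP with "syslog_timestamp", are skipped
    // outright; null is returned unless a remaining candidate matches every sample message in
    // its entirety.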
*/ - public Tuple findFullLineGrokPattern() { + public Tuple findFullLineGrokPattern(String timestampField) { for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) { - if (candidate.matchesAll(sampleMessages)) { - return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + if (timestampField == null || timestampField.equals(candidate.getTimeField())) { + if (candidate.matchesAll(sampleMessages)) { + return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + } } } return null; } + /** + * This method processes a user-supplied Grok pattern that will match all of the sample messages in their entirety. + * It will also update mappings and field stats if they are non-null. + * @param grokPattern The user supplied Grok pattern. + * @param timestampField The name of the timestamp field within the Grok pattern. + * @throws IllegalArgumentException If the supplied Grok pattern does not match the sample messages. + */ + public void validateFullLineGrokPattern(String grokPattern, String timestampField) { + + FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField); + if (candidate.matchesAll(sampleMessages)) { + candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + } else { + throw new IllegalArgumentException("Supplied Grok pattern [" + grokPattern + "] does not match sample messages"); + } + } + /** * Build a Grok pattern that will match all of the sample messages in their entirety. * @param seedPatternName A pattern that has already been determined to match some portion of every sample message. @@ -564,14 +584,26 @@ public String processCaptures(Map fieldNameCountStore, Collecti */ static class FullMatchGrokPatternCandidate { - private final String grokString; + private final String grokPattern; private final String timeField; private final Grok grok; - FullMatchGrokPatternCandidate(String grokPatternName, String timeField) { - grokString = "%{" + grokPatternName + "}"; + static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField) { + return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField); + } + + static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField) { + return new FullMatchGrokPatternCandidate(grokPattern, timeField); + } + + private FullMatchGrokPatternCandidate(String grokPattern, String timeField) { + this.grokPattern = grokPattern; this.timeField = timeField; - grok = new Grok(Grok.getBuiltinPatterns(), grokString); + grok = new Grok(Grok.getBuiltinPatterns(), grokPattern); + } + + public String getTimeField() { + return timeField; } public boolean matchesAll(Collection sampleMessages) { @@ -585,7 +617,7 @@ public boolean matchesAll(Collection sampleMessages) { public Tuple processMatch(List explanation, Collection sampleMessages, Map mappings, Map fieldStats) { - explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate"); + explanation.add("A full message Grok pattern [" + grokPattern.substring(2, grokPattern.length() - 1) + "] looks appropriate"); if (mappings != null || fieldStats != null) { Map> valuesPerField = new HashMap<>(); @@ -594,41 +626,39 @@ public Tuple processMatch(List explanation, Collection captures = grok.captures(sampleMessage); // If the pattern doesn't match then captures will be null if (captures == null) { - throw new IllegalStateException("[" + grokString 
+ "] does not match snippet [" + sampleMessage + "]"); + throw new IllegalStateException("[" + grokPattern + "] does not match snippet [" + sampleMessage + "]"); } for (Map.Entry capture : captures.entrySet()) { String fieldName = capture.getKey(); String fieldValue = capture.getValue().toString(); - - // Exclude the time field because that will be dropped and replaced with @timestamp - if (fieldName.equals(timeField) == false) { - valuesPerField.compute(fieldName, (k, v) -> { - if (v == null) { - return new ArrayList<>(Collections.singletonList(fieldValue)); - } else { - v.add(fieldValue); - return v; - } - }); - } + valuesPerField.compute(fieldName, (k, v) -> { + if (v == null) { + return new ArrayList<>(Collections.singletonList(fieldValue)); + } else { + v.add(fieldValue); + return v; + } + }); } } for (Map.Entry> valuesForField : valuesPerField.entrySet()) { String fieldName = valuesForField.getKey(); if (mappings != null) { - mappings.put(fieldName, - FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue())); + // Exclude the time field because that will be dropped and replaced with @timestamp + if (fieldName.equals(timeField) == false) { + mappings.put(fieldName, + FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue())); + } } if (fieldStats != null) { - fieldStats.put(fieldName, - FileStructureUtils.calculateFieldStats(valuesForField.getValue())); + fieldStats.put(fieldName, FileStructureUtils.calculateFieldStats(valuesForField.getValue())); } } } - return new Tuple<>(timeField, grokString); + return new Tuple<>(timeField, grokPattern); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java index a488549bc524b..b20658f872b65 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java @@ -33,7 +33,8 @@ public class JsonFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static JsonFileStructureFinder makeJsonFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) throws IOException { + Boolean hasByteOrderMarker, FileStructureOverrides overrides) + throws IOException { List> sampleRecords = new ArrayList<>(); @@ -51,7 +52,7 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat .setNumLinesAnalyzed(sampleMessages.size()) .setNumMessagesAnalyzed(sampleRecords.size()); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setTimestampFormats(timeField.v2().dateFormats) @@ -62,7 +63,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords); SortedMap mappings = mappingsAndFieldStats.v1(); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + 
Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java index 02be3c1cf19d4..cfeaa222679c0 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java @@ -8,6 +8,7 @@ import org.elasticsearch.common.xcontent.DeprecationHandler; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.IOException; import java.io.StringReader; @@ -18,6 +19,11 @@ public class JsonFileStructureFinderFactory implements FileStructureFinderFactory { + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.JSON; + } + /** * This format matches if the sample consists of one or more JSON documents. * If there is more than one, they must be newline-delimited. The @@ -61,9 +67,9 @@ DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader( } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws IOException { - return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException { + return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); } private static class ContextPrintingStringReader extends StringReader { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index 95e0a5dc69d6a..e6e445a3ff6b1 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -28,17 +28,19 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) { + Boolean hasByteOrderMarker, FileStructureOverrides overrides) { String[] sampleLines = sample.split("\n"); - Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines); + Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides); if (bestTimestamp == null) { // Is it appropriate to treat a file that is neither structured nor has // a regular pattern of timestamps as a log file? Probably not... 
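[Editor's note] One plausible use of the new canFindFormat(...) hook in the structure finder manager, sketched under assumptions: the ORDERED_STRUCTURE_FACTORIES constant name, the getFormat() accessor, and the exact error wording are not shown in this hunk and may differ in the real code:

    for (FileStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
        if (factory.canFindFormat(overrides.getFormat()) == false) {
            continue; // factory cannot produce the requested format, so skip it entirely
        }
        if (factory.canCreateFromSample(explanation, sample)) {
            return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides);
        }
    }
    // The manager tests below expect a failure like "Input did not match the specified format [json]"
    // when the overridden format rules out every factory that recognises the sample
    // (this assumes Format.toString() renders the enum value in lower case).
    throw new IllegalArgumentException("Input did not match the specified format [" + overrides.getFormat() + "]");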
- throw new IllegalArgumentException("Could not find a timestamp in the sample provided"); + throw new IllegalArgumentException("Could not find " + + ((overrides.getTimestampFormat() == null) ? "a timestamp" : "the specified timestamp format") + " in the sample provided"); } - explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]"); + explanation.add(((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + " format is [" + + bestTimestamp.v1() + "]"); List sampleMessages = new ArrayList<>(); StringBuilder preamble = new StringBuilder(); @@ -86,17 +88,26 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex SortedMap fieldStats = new TreeMap<>(); - // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove - String interimTimestampField; - String grokPattern; GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats); - Tuple timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(); - if (timestampFieldAndFullMatchGrokPattern != null) { - interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); - grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); + // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove + String interimTimestampField = overrides.getTimestampField(); + String grokPattern = overrides.getGrokPattern(); + if (grokPattern != null) { + if (interimTimestampField == null) { + interimTimestampField = "timestamp"; + } + grokPatternCreator.validateFullLineGrokPattern(grokPattern, interimTimestampField); } else { - interimTimestampField = "timestamp"; - grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField); + Tuple timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(interimTimestampField); + if (timestampFieldAndFullMatchGrokPattern != null) { + interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); + grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); + } else { + if (interimTimestampField == null) { + interimTimestampField = "timestamp"; + } + grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField); + } } FileStructure structure = structureBuilder @@ -127,14 +138,14 @@ public FileStructure getStructure() { return structure; } - static Tuple> mostLikelyTimestamp(String[] sampleLines) { + static Tuple> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides) { Map>> timestampMatches = new LinkedHashMap<>(); int remainingLines = sampleLines.length; double differenceBetweenTwoHighestWeights = 0.0; for (String sampleLine : sampleLines) { - TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine); + TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat()); if (match != null) { TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern, match.grokPatternName, ""); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index 5f737eeb9b823..b92b705aaffdf 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -5,6 +5,8 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + import java.util.List; import java.util.regex.Pattern; @@ -13,6 +15,11 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac // This works because, by default, dot doesn't match newlines private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+."); + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.SEMI_STRUCTURED_TEXT; + } + /** * This format matches if the sample contains at least one newline and at least two * non-blank lines. @@ -33,7 +40,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) { - return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) { + return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, + overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 81e490878a007..363b1352a54cb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -148,6 +148,16 @@ public static TimestampMatch findFirstMatch(String text) { return findFirstMatch(text, 0); } + /** + * Find the first timestamp format that matches part of the supplied value. + * @param text The value that the returned timestamp format must exist within. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. + */ + public static TimestampMatch findFirstMatch(String text, String requiredFormat) { + return findFirstMatch(text, 0, requiredFormat); + } + /** * Find the first timestamp format that matches part of the supplied value, * excluding a specified number of candidate formats. @@ -156,26 +166,40 @@ public static TimestampMatch findFirstMatch(String text) { * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) { + return findFirstMatch(text, ignoreCandidates, null); + } + + /** + * Find the first timestamp format that matches part of the supplied value, + * excluding a specified number of candidate formats. + * @param text The value that the returned timestamp format must exist within. + * @param ignoreCandidates The number of candidate formats to exclude from the search. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. 
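[Editor's note] An illustrative call, not part of the patch, showing how the new requiredFormat argument filters the candidate list for partial matches; the input string is borrowed from the text-log samples used in the tests:

    TimestampMatch any = TimestampFormatFinder.findFirstMatch("[2018-05-17T13:41:23] hello");
    TimestampMatch iso = TimestampFormatFinder.findFirstMatch("[2018-05-17T13:41:23] hello", "ISO8601");
    // Both calls report the ISO8601 candidate here; requesting a format the line does not
    // contain would make the second call return null, which is how mostLikelyTimestamp
    // honours a timestamp_format override.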
+ */ + public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, String requiredFormat) { Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()]; int index = ignoreCandidates; for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - boolean quicklyRuledOut = false; - for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { - if (quickRuleoutMatches[quickRuleOutIndex] == null) { - quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find(); - } - if (quickRuleoutMatches[quickRuleOutIndex] == false) { - quicklyRuledOut = true; - break; + if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + boolean quicklyRuledOut = false; + for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { + if (quickRuleoutMatches[quickRuleOutIndex] == null) { + quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find(); + } + if (quickRuleoutMatches[quickRuleOutIndex] == false) { + quicklyRuledOut = true; + break; + } } - } - if (quicklyRuledOut == false) { - Map captures = candidate.strictSearchGrok.captures(text); - if (captures != null) { - String preface = captures.getOrDefault(PREFACE, "").toString(); - String epilogue = captures.getOrDefault(EPILOGUE, "").toString(); - return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(), - text.length() - epilogue.length()), epilogue); + if (quicklyRuledOut == false) { + Map captures = candidate.strictSearchGrok.captures(text); + if (captures != null) { + String preface = captures.getOrDefault(PREFACE, "").toString(); + String epilogue = captures.getOrDefault(EPILOGUE, "").toString(); + return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(), + text.length() - epilogue.length()), epilogue); + } } } ++index; @@ -192,6 +216,16 @@ public static TimestampMatch findFirstFullMatch(String text) { return findFirstFullMatch(text, 0); } + /** + * Find the best timestamp format for matching an entire field value. + * @param text The value that the returned timestamp format must match in its entirety. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. + */ + public static TimestampMatch findFirstFullMatch(String text, String requiredFormat) { + return findFirstFullMatch(text, 0, requiredFormat); + } + /** * Find the best timestamp format for matching an entire field value, * excluding a specified number of candidate formats. @@ -200,11 +234,25 @@ public static TimestampMatch findFirstFullMatch(String text) { * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) { + return findFirstFullMatch(text, ignoreCandidates, null); + } + + /** + * Find the best timestamp format for matching an entire field value, + * excluding a specified number of candidate formats. + * @param text The value that the returned timestamp format must match in its entirety. + * @param ignoreCandidates The number of candidate formats to exclude from the search. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. 
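[Editor's note] The full-value variant behaves the same way; a sketch using values from the FileStructureUtils tests later in this patch:

    TimestampMatch full = TimestampFormatFinder.findFirstFullMatch("2018-05-24T17:28:31,735", "ISO8601");
    // full is non-null and full.dateFormats contains "ISO8601"; asking for
    // "EEE MMM dd HH:mm:ss YYYY" instead returns null, which is what lets guessTimestampField
    // turn an impossible timestamp_format override into a clear IllegalArgumentException.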
+ */ + public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, String requiredFormat) { int index = ignoreCandidates; for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - Map captures = candidate.strictFullMatchGrok.captures(text); - if (captures != null) { - return makeTimestampMatch(candidate, index, "", text, ""); + if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + Map captures = candidate.strictFullMatchGrok.captures(text); + if (captures != null) { + return makeTimestampMatch(candidate, index, "", text, ""); + } } ++index; } @@ -417,7 +465,7 @@ static final class CandidateTimestampFormat { // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + "%{GREEDYDATA:" + EPILOGUE + "}"); - this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern); + this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), "^" + strictGrokPattern + "$"); this.standardGrokPatternName = standardGrokPatternName; assert quickRuleOutIndices.stream() .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index 570f36f59c06e..d5e3fba34c972 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -38,7 +38,7 @@ public class XmlFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static XmlFileStructureFinder makeXmlFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) + Boolean hasByteOrderMarker, FileStructureOverrides overrides) throws IOException, ParserConfigurationException, SAXException { String messagePrefix; @@ -90,7 +90,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio .setNumMessagesAnalyzed(sampleRecords.size()) .setMultilineStartPattern("^\\s*<" + topLevelTag); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setTimestampFormats(timeField.v2().dateFormats) @@ -110,8 +110,10 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings); SortedMap outerMappings = new TreeMap<>(); outerMappings.put(topLevelTag, secondLevelProperties); - outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } FileStructure structure = structureBuilder .setMappings(outerMappings) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java 
b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index f8536d1437594..3079f53931db6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -27,6 +28,11 @@ public XmlFileStructureFinderFactory() { xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE); } + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.XML; + } + /** * This format matches if the sample consists of one or more XML documents, * all with the same root element name. If there is more than one document, @@ -115,8 +121,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException, ParserConfigurationException, SAXException { - return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java index 83293c7d60efa..316a4b56e4a07 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java @@ -39,6 +39,17 @@ protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient FindFileStructureAction.Request request = new FindFileStructureAction.Request(); request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); + request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); + request.setFormat(restRequest.param(FindFileStructureAction.Request.FORMAT.getPreferredName())); + request.setColumnNames(restRequest.paramAsStringArray(FindFileStructureAction.Request.COLUMN_NAMES.getPreferredName(), null)); + request.setHasHeaderRow(restRequest.paramAsBoolean(FindFileStructureAction.Request.HAS_HEADER_ROW.getPreferredName(), null)); + request.setDelimiter(restRequest.param(FindFileStructureAction.Request.DELIMITER.getPreferredName())); + request.setQuote(restRequest.param(FindFileStructureAction.Request.QUOTE.getPreferredName())); + request.setShouldTrimFields(restRequest.paramAsBoolean(FindFileStructureAction.Request.SHOULD_TRIM_FIELDS.getPreferredName(), + null)); + request.setGrokPattern(restRequest.param(FindFileStructureAction.Request.GROK_PATTERN.getPreferredName())); + 
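[Editor's note] For completeness, a sketch of the transport-level request that these REST parameters populate; the sample values are invented, and BytesArray is used here only to wrap the sample text:

    FindFileStructureAction.Request request = new FindFileStructureAction.Request();
    request.setFormat("delimited");
    request.setDelimiter(",");                           // delimiter and quote arrive as strings, mirroring the REST handler above
    request.setQuote("\"");
    request.setColumnNames(new String[] { "time", "message" });
    request.setTimestampField("time");
    request.setSample(new BytesArray("2018-05-17T13:41:23,hello\n2018-05-17T13:41:32,hello again\n"));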
request.setTimestampFormat(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FORMAT.getPreferredName())); + request.setTimestampField(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FIELD.getPreferredName())); if (restRequest.hasContent()) { request.setSample(restRequest.content()); } else { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java index 6bcb827be94d8..53f3a2a4d4ca6 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java @@ -7,10 +7,10 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestCase { - private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false); - private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', 2, false); - private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', 4, false); - private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', 5, true); + private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false); + private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 2, false); + private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false); + private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true); // CSV - no need to check JSON or XML because they come earlier in the order we check formats diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 4e692d583918e..decc61a5397a5 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { - private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false); + private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false); public void testCreateConfigsGivenCompleteCsv() throws Exception { String sample = "time,message\n" + @@ -29,7 +29,8 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -43,6 +44,7 @@ public void testCreateConfigsGivenCompleteCsv() 
throws Exception { assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); @@ -51,6 +53,76 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build(); + + String sample = "time,message\n" + + "2018-05-17T13:41:23,hello\n" + + "2018-05-17T13:41:32,hello again\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); + assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("my_time", "my_message"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("my_time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Exception { + + // It's obvious the first row really should be a header row, so by overriding + // detection with the wrong choice the results will be completely changed + FileStructureOverrides overrides = FileStructureOverrides.builder().setHasHeaderRow(false).build(); + + String sample = "time,message\n" + + "2018-05-17T13:41:23,hello\n" + + "2018-05-17T13:41:32,hello again\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + 
assertNull(structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertFalse(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("column1", "column2"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertNull(structure.getTimestampField()); + assertNull(structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception { String sample = "message,time,count\n" + "\"hello\n" + @@ -60,7 +132,8 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -74,6 +147,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames()); @@ -93,7 +167,8 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -110,6 +185,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", @@ -120,6 +196,50 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() throws Exception { + + // Default timestamp field is the first field from the start of each row that contains a + // consistent timestamp format, so if we want the second we need an override + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("tpep_dropoff_datetime").build(); + + String sample = 
"VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount,,\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", + structure.getExcludeLinesPattern()); + assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", + "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", + "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("tpep_dropoff_datetime", structure.getTimestampField()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception { String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + @@ -131,7 +251,8 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -148,6 +269,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce 
structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", @@ -158,6 +280,53 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNamesOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setColumnNames(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", + "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", + "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", + "my_total_amount")).build(); + + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", + structure.getExcludeLinesPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", + "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", + "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", 
"my_improvement_surcharge", "my_total_amount"), + structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("my_tpep_pickup_datetime", structure.getTimestampField()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" + "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" + @@ -166,7 +335,8 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -181,6 +351,7 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { structure.getExcludeLinesPattern()); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames()); @@ -197,7 +368,7 @@ public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException { "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1()); + DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1(), FileStructureOverrides.EMPTY_OVERRIDES); assertTrue(header.v1()); assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype")); @@ -210,7 +381,8 @@ public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1()); + DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1(), + FileStructureOverrides.EMPTY_OVERRIDES); assertFalse(header.v1()); assertThat(header.v2(), arrayContaining("", "", "", "")); @@ -283,12 +455,12 @@ public void testLineHasUnescapedQuote() { public void testRowContainsDuplicateNonEmptyValues() { - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a"))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList(""))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c"))); - assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a"))); - assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b"))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", ""))); - 
assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", ""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList("a"))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList(""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "c"))); + assertEquals("a", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "a"))); + assertEquals("b", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "b"))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "", ""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("", "a", ""))); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java index 10e780f1d34c1..00929ff474cce 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java @@ -6,12 +6,14 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import com.ibm.icu.text.CharsetMatch; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.ByteArrayInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; import static org.hamcrest.Matchers.startsWith; import static org.hamcrest.core.IsInstanceOf.instanceOf; @@ -47,26 +49,62 @@ public void testFindCharsetGivenBinary() throws Exception { } public void testMakeBestStructureGivenJson() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, - "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(JsonFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(JsonFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception { + + // Need to change the quote character from the default of double quotes + // otherwise the quotes in the JSON will stop it parsing as CSV + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenXml() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, - "hello", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(XmlFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(XmlFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenXmlAndTextOverride() 
throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenCsv() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" + - "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(DelimitedFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(DelimitedFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenCsvAndJsonOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build(); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides)); + + assertEquals("Input did not match the specified format [json]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" + - "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(TextLogFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(TextLogFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { + + // Every line of the text sample has two colons, so colon delimited is possible, just very weird + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(DelimitedFileStructureFinder.class)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index ac8f95670aba8..8dbfb6a8047de 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -17,6 +17,7 @@ import java.util.Map; import java.util.SortedMap; +import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; import static org.hamcrest.Matchers.contains; public class FileStructureUtilsTests extends FileStructureTestCase { @@ -32,57 +33,106 @@ public void testMoreLikelyGivenKeyword() { assertFalse(FileStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256))); } - public void testSingleSampleSingleField() { + public void testGuessTimestampGivenSingleSampleSingleField() { Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Tuple match = - 
FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample)); + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().dateFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithSameSingleTimeField() { + public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFieldOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + Tuple match = + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + assertNotNull(match); + assertEquals("field1", match.v1()); + assertThat(match.v2().dateFormats, contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + } + + public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFieldOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field2").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + + assertEquals("Specified timestamp field [field2] is not present in record [{field1=2018-05-24T17:28:31,735}]", e.getMessage()); + } + + public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + Tuple match = + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + assertNotNull(match); + assertEquals("field1", match.v1()); + assertThat(match.v2().dateFormats, contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + } + + public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("EEE MMM dd HH:mm:ss YYYY").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + + assertEquals("Specified timestamp format [EEE MMM dd HH:mm:ss YYYY] does not match for record [{field1=2018-05-24T17:28:31,735}]", + e.getMessage()); + } + + public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().dateFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithOneSingleTimeFieldDifferentFormat() { + 
public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithDifferentSingleTimeField() { + public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSingleSampleManyFieldsOneTimeFormat() { + public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { Map sample = new LinkedHashMap<>(); sample.put("foo", "not a time"); sample.put("time", "2018-05-24 17:28:31,735"); sample.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample)); + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameSingleTimeFormat() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -92,14 +142,14 @@ public void testSamplesWithManyFieldsSameSingleTimeFormat() { sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -109,11 +159,11 @@ public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { sample2.put("time", "May 29 2018 11:53:02"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { Map sample1 = new LinkedHashMap<>(); sample1.put("red_herring", "May 29 2007 11:53:02"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -123,14 +173,14 @@ public void 
testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "May 24 2018 17:28:31"); @@ -140,14 +190,14 @@ public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { sample2.put("time", "May 29 2018 11:53:02"); sample2.put("red_herring", "17"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsInconsistentTimeFields() { + public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time1", "May 24 2018 17:28:31"); @@ -157,11 +207,11 @@ public void testSamplesWithManyFieldsInconsistentTimeFields() { sample2.put("time2", "May 29 2018 11:53:02"); sample2.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { + public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time1", "2018-05-09 17:28:31,735"); @@ -173,7 +223,7 @@ public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { sample2.put("time3", "Thu, May 10 2018 11:53:02"); sample2.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time2", match.v1()); assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index 858709e2764bb..271e071fc2717 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -244,8 +244,7 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { 
grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), - mappings.get("extra_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("extra_timestamp")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -273,7 +272,8 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { Map mappings = new HashMap<>(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); - assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern()); + assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), + grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? "timestamp" : null)); assertEquals(10, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth")); @@ -323,4 +323,59 @@ public void testAdjustForPunctuationGivenNoCommonPrefix() { assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString()); assertSame(snippets, adjustedSnippets); } + + public void testValidateFullLineGrokPatternGivenValid() { + + String timestampField = "utc_timestamp"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; + + // Two timestamps: one local, one UTC + Collection sampleMessages = Arrays.asList( + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + + grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); + assertEquals(9, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("serial_no")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("local_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("user_id")); + 
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("host")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("client_ip")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("method")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("program")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("message")); + } + + public void testValidateFullLineGrokPatternGivenInvalid() { + + String timestampField = "utc_timestamp"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; + + Collection sampleMessages = Arrays.asList( + "Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53", + "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField)); + + assertEquals("Supplied Grok pattern [" + grokPattern + "] does not match sample messages", e.getMessage()); + } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java index f41868be86286..6856e9a60214b 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java @@ -18,7 +18,8 @@ public void testCreateConfigsGivenGoodJson() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -32,6 +33,7 @@ public void testCreateConfigsGivenGoodJson() throws Exception { assertNull(structure.getExcludeLinesPattern()); assertNull(structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertNull(structure.getGrokPattern()); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index a23080a827277..5bc40a165117e 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -15,6 +15,90 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { + private static final String EXCEPTION_TRACE_SAMPLE = + "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + + "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + + "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + + "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + + "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + + "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + + "in length; got 49023\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + + "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + + "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + + "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; + private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory(); public void testCreateConfigsGivenElasticsearchLog() throws Exception { @@ -22,7 +106,8 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -36,6 +121,7 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertNull(structure.getExcludeLinesPattern()); assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); @@ -43,6 +129,85 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("my_time").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + 
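For context on the five-argument createFromSample call added in the '+' line below, here is a minimal usage sketch. It relies only on pieces already exercised by these tests (FileStructureOverrides.builder(), setTimestampField, setTimestampFormat, build(), and the new overload); combining the two timestamp setters in a single overrides object is an illustrative assumption rather than something any individual test asserts.

    // Illustrative sketch, not part of this change: build a non-empty overrides
    // object and pass it to the factory alongside the usual sample arguments.
    FileStructureOverrides overrides = FileStructureOverrides.builder()
        .setTimestampField("my_time")      // field expected to hold the timestamp
        .setTimestampFormat("ISO8601")     // format that field is expected to use
        .build();

    FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE,
        charset, hasByteOrderMarker, overrides);
    FileStructure structure = structureFinder.getStructure();

Tests with no opinion pass FileStructureOverrides.EMPTY_OVERRIDES instead, as in the '+' line that follows.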
assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); + assertNull(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals("\\[%{TIMESTAMP_ISO8601:my_time}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); + assertEquals("my_time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{TIMESTAMP_ISO8601:timestamp}\\]" + + "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); + assertNull(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", structure.getGrokPattern()); + assertEquals("timestamp", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverride() { + + // This Grok pattern cannot be matched against the messages in the sample because the fields are in the wrong order + FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides)); + + assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); + } + public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); @@ -144,97 +309,17 @@ public void testMostLikelyTimestampGivenAllSame() { 
"[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; - Tuple> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n")); + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); assertNotNull(mostLikelyMatch); assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } public void testMostLikelyTimestampGivenExceptionTrace() { - String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + - "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + - "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + - "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + - "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + - "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + - "in length; got 49023\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + - "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + - "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:295) 
[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + - "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; - - Tuple> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n")); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); assertNotNull(mostLikelyMatch); // Even though many lines have a timestamp near the end (in the Lucene version information), @@ -243,4 +328,26 @@ public void testMostLikelyTimestampGivenExceptionTrace() { assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } + + public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("YYYY-MM-dd HH:mm:ss").build(); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + assertNotNull(mostLikelyMatch); + + // The override should force the seemingly inferior choice of timestamp + assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", + ""), mostLikelyMatch.v1()); + } + + public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + assertNull(mostLikelyMatch); + } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index 4bf65ba783572..01c44147b0430 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -18,7 +18,8 @@ public void testCreateConfigsGivenGoodXml() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker); 
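The '+' lines below move this XML test onto the overload that takes overrides, passing FileStructureOverrides.EMPTY_OVERRIDES. As a hedged sketch of what that constant amounts to (its real definition lives in FileStructureOverrides, which is not part of this hunk), an empty overrides object behaves like a builder on which no setter was called, so every override is null and the finder falls back to pure auto-detection:

    // Assumption for illustration only: EMPTY_OVERRIDES is equivalent to building
    // an overrides object without invoking any setter, leaving every value unset.
    FileStructureOverrides emptyOverrides = FileStructureOverrides.builder().build();

    FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE,
        charset, hasByteOrderMarker, emptyOverrides);

Tests that do have an opinion (format, quote, delimiter, grok pattern, timestamp field or format) build a non-empty instance instead, as the delimited, text and grok cases above show.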
+        FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);

         FileStructure structure = structureFinder.getStructure();
@@ -32,6 +33,7 @@ public void testCreateConfigsGivenGoodXml() throws Exception {
         assertNull(structure.getExcludeLinesPattern());
         assertEquals("^\\s*
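The two mostLikelyTimestamp override tests in TextLogFileStructureFinderTests above pin down a simple rule: when a timestamp format override is supplied, only candidates matching that format may be chosen, and when none match the result is null rather than a different guess. The sketch below restates that rule; it assumes FileStructureOverrides exposes a getTimestampFormat() accessor and reuses the public dateFormats list on TimestampMatch that the assertions already read. It is not the production selection code.

    // Sketch of the behaviour the override tests assert, not the real implementation.
    // Assumes java.util.List is in scope and that TimestampFormatFinder.TimestampMatch
    // exposes dateFormats as a List<String>.
    static TimestampMatch pickTimestamp(List<TimestampMatch> candidates, FileStructureOverrides overrides) {
        String required = overrides.getTimestampFormat();   // assumed accessor
        if (required == null) {
            // no override: fall back to whatever heuristic ranks the candidates
            return candidates.isEmpty() ? null : candidates.get(0);
        }
        return candidates.stream()
            .filter(candidate -> candidate.dateFormats.contains(required))
            .findFirst()
            .orElse(null);   // impossible override: no match at all, as the tests expect
    }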