Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/reference/ml/apis/find-file-structure.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ chosen.
structure finder produced its result. The default value is `false`.

`format`::
(string) The high level structure of the file. Valid values are `json`, `xml`,
(string) The high level structure of the file. Valid values are `ndjson`, `xml`,
`delimited`, and `semi_structured_text`. If this parameter is not specified,
the structure finder chooses one.

Expand Down Expand Up @@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
"charset" : "UTF-8", <4>
"has_byte_order_marker" : false, <5>
"format" : "json", <6>
"format" : "ndjson", <6>
"need_client_timezone" : false, <7>
"mappings" : { <8>
"author" : {
Expand Down Expand Up @@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:

<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
For NDJSON, this value is the same as `num_lines_analyzed`. For other file
formats, messages can span several lines.
<3> `sample_start` reproduces the first two messages in the file verbatim. This
may help to diagnose parse errors or accidental uploads of the wrong file.
<4> `charset` indicates the character encoding used to parse the file.
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
file begins with a byte order marker.
<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
<7> If a timestamp format is detected that does not include a timezone,
`need_client_timezone` will be `true`. The server that parses the file must
therefore be told the correct timezone by the client.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {

public enum Format {

JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;

public boolean supportsNesting() {
switch (this) {
case JSON:
case NDJSON:
case XML:
return true;
case DELIMITED:
Expand All @@ -49,7 +49,7 @@ public boolean supportsNesting() {

public boolean isStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return true;
Expand All @@ -62,7 +62,7 @@ public boolean isStructured() {

public boolean isSemiStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return false;
Expand Down Expand Up @@ -645,7 +645,7 @@ public FileStructure build() {
}

switch (format) {
case JSON:
case NDJSON:
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ public void testValidateNonDelimited() {
public void testValidateNonSemiStructuredText() {

FindFileStructureAction.Request request = new FindFileStructureAction.Request();
request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setGrokPattern(randomAlphaOfLength(80));
request.setSample(new BytesArray("foo\n"));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
* 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
* 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
* 4. Create an appropriate structure object and delegate writing configs to it
*/
public final class FileStructureFinderManager {
Expand Down Expand Up @@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
* These need to be ordered so that the more generic formats come after the more specific ones
*/
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
new JsonFileStructureFinderFactory(),
new NdJsonFileStructureFinderFactory(),
new XmlFileStructureFinderFactory(),
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
// NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
new DelimitedFileStructureFinderFactory(';', '"', 4, false),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

/**
* Really ND-JSON.
* Newline-delimited JSON.
*/
public class JsonFileStructureFinder implements FileStructureFinder {
public class NdJsonFileStructureFinder implements FileStructureFinder {

private final List<String> sampleMessages;
private final FileStructure structure;

static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {
static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {

List<Map<String, ?>> sampleRecords = new ArrayList<>();

Expand All @@ -43,10 +43,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
sampleMessage);
sampleRecords.add(parser.mapOrdered());
timeoutChecker.check("JSON parsing");
timeoutChecker.check("NDJSON parsing");
}

FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
Expand Down Expand Up @@ -84,10 +84,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
.setExplanation(explanation)
.build();

return new JsonFileStructureFinder(sampleMessages, structure);
return new NdJsonFileStructureFinder(sampleMessages, structure);
}

private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@

import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {

@Override
public boolean canFindFormat(FileStructure.Format format) {
return format == null || format == FileStructure.Format.JSON;
return format == null || format == FileStructure.Format.NDJSON;
}

/**
* This format matches if the sample consists of one or more JSON documents.
* This format matches if the sample consists of one or more NDJSON documents.
* If there is more than one, they must be newline-delimited. The
* documents must be non-empty, to prevent lines containing "{}" from matching.
*/
Expand All @@ -41,35 +41,35 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {

if (parser.map().isEmpty()) {
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
return false;
}
++completeDocCount;
if (parser.nextToken() != null) {
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
sampleLine + "]");
return false;
}
}
}
} catch (IOException | IllegalStateException e) {
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
return false;
}

if (completeDocCount == 0) {
explanation.add("Not JSON because sample didn't contain a complete document");
explanation.add("Not NDJSON because sample didn't contain a complete document");
return false;
}

explanation.add("Deciding sample is newline delimited JSON");
explanation.add("Deciding sample is newline delimited NDJSON");
return true;
}

@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
timeoutChecker);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);

// CSV - no need to check JSON or XML because they come earlier in the order we check formats
// CSV - no need to check NDJSON or XML because they come earlier in the order we check formats

public void testCanCreateCsvFromSampleGivenCsv() {

Expand All @@ -39,7 +39,7 @@ public void testCanCreateCsvFromSampleGivenText() {
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
// TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats

public void testCanCreateTsvFromSampleGivenTsv() {

Expand All @@ -61,7 +61,7 @@ public void testCanCreateTsvFromSampleGivenText() {
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
// Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats

public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {

Expand All @@ -78,7 +78,7 @@ public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
// Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
// values because they come earlier in the order we check formats

public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,20 @@ public void testFindCharsetGivenBinary() throws Exception {
}
}

public void testMakeBestStructureGivenJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
public void testMakeBestStructureGivenNdJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
}

public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {

// Need to change the quote character from the default of double quotes
// otherwise the quotes in the JSON will stop it parsing as CSV
// otherwise the quotes in the NDJSON will stop it parsing as CSV
FileStructureOverrides overrides = FileStructureOverrides.builder()
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();

assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
}

public void testMakeBestStructureGivenXml() throws Exception {
Expand All @@ -109,13 +109,13 @@ public void testMakeBestStructureGivenCsv() throws Exception {

public void testMakeBestStructureGivenCsvAndJsonOverride() {

FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();

IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER));

assertEquals("Input did not match the specified format [json]", e.getMessage());
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
}

public void testMakeBestStructureGivenText() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
"2018-05-17T16:23:40,key1,42.0\n" +
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";

protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@

import java.util.Collections;

public class JsonFileStructureFinderTests extends FileStructureTestCase {
public class NdJsonFileStructureFinderTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();

public void testCreateConfigsGivenGoodJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));

String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);

FileStructure structure = structureFinder.getStructure();

assertEquals(FileStructure.Format.JSON, structure.getFormat());
assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
*/
package org.elasticsearch.xpack.ml.filestructurefinder;

public class JsonFileStructureFinderFactoryTests extends FileStructureTestCase {
public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();

public void testCanCreateFromSampleGivenJson() {
public void testCanCreateFromSampleGivenNdJson() {

assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
}

public void testCanCreateFromSampleGivenXml() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas

private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();

// No need to check JSON, XML, CSV, TSV, semi-colon delimited values or pipe
// No need to check NDJSON, XML, CSV, TSV, semi-colon delimited values or pipe
// delimited values because they come earlier in the order we check formats

public void testCanCreateFromSampleGivenText() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();

// No need to check JSON because it comes earlier in the order we check formats
// No need to check NDJSON because it comes earlier in the order we check formats

public void testCanCreateFromSampleGivenXml() {

Expand Down
Loading