Skip to content

Commit b6f1c5a

Browse files
author
David Roberts
committed
[ML] Add a file structure determination endpoint (#33471)
This endpoint accepts an arbitrary file in the request body and attempts to determine the structure. If successful it also proposes mappings that could be used when indexing the file's contents, and calculates simple statistics for each of the fields that are useful in the data preparation step prior to configuring machine learning jobs.
1 parent af95c9f commit b6f1c5a

File tree

16 files changed

+599
-27
lines changed

16 files changed

+599
-27
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.elasticsearch.xpack.core.ml.action.DeleteJobAction;
5555
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
5656
import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction;
57+
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
5758
import org.elasticsearch.xpack.core.ml.action.FlushJobAction;
5859
import org.elasticsearch.xpack.core.ml.action.ForecastJobAction;
5960
import org.elasticsearch.xpack.core.ml.action.GetBucketsAction;
@@ -264,6 +265,7 @@ public List<GenericAction> getClientActions() {
264265
GetCalendarEventsAction.INSTANCE,
265266
PostCalendarEventsAction.INSTANCE,
266267
PersistJobAction.INSTANCE,
268+
FindFileStructureAction.INSTANCE,
267269
// security
268270
ClearRealmCacheAction.INSTANCE,
269271
ClearRolesCacheAction.INSTANCE,
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.core.ml.action;
7+
8+
import org.elasticsearch.action.Action;
9+
import org.elasticsearch.action.ActionRequest;
10+
import org.elasticsearch.action.ActionRequestBuilder;
11+
import org.elasticsearch.action.ActionRequestValidationException;
12+
import org.elasticsearch.action.ActionResponse;
13+
import org.elasticsearch.client.ElasticsearchClient;
14+
import org.elasticsearch.common.ParseField;
15+
import org.elasticsearch.common.bytes.BytesReference;
16+
import org.elasticsearch.common.io.stream.StreamInput;
17+
import org.elasticsearch.common.io.stream.StreamOutput;
18+
import org.elasticsearch.common.io.stream.Writeable;
19+
import org.elasticsearch.common.xcontent.StatusToXContentObject;
20+
import org.elasticsearch.common.xcontent.XContentBuilder;
21+
import org.elasticsearch.rest.RestStatus;
22+
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
23+
24+
import java.io.IOException;
25+
import java.util.Objects;
26+
27+
import static org.elasticsearch.action.ValidateActions.addValidationError;
28+
29+
public class FindFileStructureAction
30+
extends Action<FindFileStructureAction.Request, FindFileStructureAction.Response, FindFileStructureAction.RequestBuilder> {
31+
32+
public static final FindFileStructureAction INSTANCE = new FindFileStructureAction();
33+
public static final String NAME = "cluster:monitor/xpack/ml/findfilestructure";
34+
35+
private FindFileStructureAction() {
36+
super(NAME);
37+
}
38+
39+
@Override
40+
public RequestBuilder newRequestBuilder(ElasticsearchClient client) {
41+
return new RequestBuilder(client, this);
42+
}
43+
44+
@Override
45+
public Response newResponse() {
46+
return new Response();
47+
}
48+
49+
static class RequestBuilder extends ActionRequestBuilder<Request, Response, RequestBuilder> {
50+
51+
RequestBuilder(ElasticsearchClient client, FindFileStructureAction action) {
52+
super(client, action, new Request());
53+
}
54+
}
55+
56+
public static class Response extends ActionResponse implements StatusToXContentObject, Writeable {
57+
58+
private FileStructure fileStructure;
59+
60+
public Response(FileStructure fileStructure) {
61+
this.fileStructure = fileStructure;
62+
}
63+
64+
Response() {
65+
}
66+
67+
public FileStructure getFileStructure() {
68+
return fileStructure;
69+
}
70+
71+
@Override
72+
public void readFrom(StreamInput in) throws IOException {
73+
super.readFrom(in);
74+
fileStructure = new FileStructure(in);
75+
}
76+
77+
@Override
78+
public void writeTo(StreamOutput out) throws IOException {
79+
super.writeTo(out);
80+
fileStructure.writeTo(out);
81+
}
82+
83+
@Override
84+
public RestStatus status() {
85+
return RestStatus.OK;
86+
}
87+
88+
@Override
89+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
90+
fileStructure.toXContent(builder, params);
91+
return builder;
92+
}
93+
94+
@Override
95+
public int hashCode() {
96+
return Objects.hash(fileStructure);
97+
}
98+
99+
@Override
100+
public boolean equals(Object other) {
101+
102+
if (this == other) {
103+
return true;
104+
}
105+
106+
if (other == null || getClass() != other.getClass()) {
107+
return false;
108+
}
109+
110+
FindFileStructureAction.Response that = (FindFileStructureAction.Response) other;
111+
return Objects.equals(fileStructure, that.fileStructure);
112+
}
113+
}
114+
115+
public static class Request extends ActionRequest {
116+
117+
public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
118+
119+
private Integer linesToSample;
120+
private BytesReference sample;
121+
122+
public Request() {
123+
}
124+
125+
public Integer getLinesToSample() {
126+
return linesToSample;
127+
}
128+
129+
public void setLinesToSample(Integer linesToSample) {
130+
this.linesToSample = linesToSample;
131+
}
132+
133+
public BytesReference getSample() {
134+
return sample;
135+
}
136+
137+
public void setSample(BytesReference sample) {
138+
this.sample = sample;
139+
}
140+
141+
@Override
142+
public ActionRequestValidationException validate() {
143+
ActionRequestValidationException validationException = null;
144+
if (linesToSample != null && linesToSample <= 0) {
145+
validationException =
146+
addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException);
147+
}
148+
if (sample == null || sample.length() == 0) {
149+
validationException = addValidationError("sample must be specified", validationException);
150+
}
151+
return validationException;
152+
}
153+
154+
@Override
155+
public void readFrom(StreamInput in) throws IOException {
156+
super.readFrom(in);
157+
linesToSample = in.readOptionalVInt();
158+
sample = in.readBytesReference();
159+
}
160+
161+
@Override
162+
public void writeTo(StreamOutput out) throws IOException {
163+
super.writeTo(out);
164+
out.writeOptionalVInt(linesToSample);
165+
out.writeBytesReference(sample);
166+
}
167+
168+
@Override
169+
public int hashCode() {
170+
return Objects.hash(linesToSample, sample);
171+
}
172+
173+
@Override
174+
public boolean equals(Object other) {
175+
176+
if (this == other) {
177+
return true;
178+
}
179+
180+
if (other == null || getClass() != other.getClass()) {
181+
return false;
182+
}
183+
184+
Request that = (Request) other;
185+
return Objects.equals(this.linesToSample, that.linesToSample) &&
186+
Objects.equals(this.sample, that.sample);
187+
}
188+
}
189+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
package org.elasticsearch.xpack.core.ml.filestructurefinder;
77

88
import org.elasticsearch.common.ParseField;
9+
import org.elasticsearch.common.io.stream.StreamInput;
10+
import org.elasticsearch.common.io.stream.StreamOutput;
11+
import org.elasticsearch.common.io.stream.Writeable;
912
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
1013
import org.elasticsearch.common.xcontent.ToXContentObject;
1114
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -16,7 +19,7 @@
1619
import java.util.Map;
1720
import java.util.Objects;
1821

19-
public class FieldStats implements ToXContentObject {
22+
public class FieldStats implements ToXContentObject, Writeable {
2023

2124
static final ParseField COUNT = new ParseField("count");
2225
static final ParseField CARDINALITY = new ParseField("cardinality");
@@ -64,6 +67,27 @@ public FieldStats(long count, int cardinality, Double minValue, Double maxValue,
6467
this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits);
6568
}
6669

70+
public FieldStats(StreamInput in) throws IOException {
71+
count = in.readVLong();
72+
cardinality = in.readVInt();
73+
minValue = in.readOptionalDouble();
74+
maxValue = in.readOptionalDouble();
75+
meanValue = in.readOptionalDouble();
76+
medianValue = in.readOptionalDouble();
77+
topHits = in.readList(StreamInput::readMap);
78+
}
79+
80+
@Override
81+
public void writeTo(StreamOutput out) throws IOException {
82+
out.writeVLong(count);
83+
out.writeVInt(cardinality);
84+
out.writeOptionalDouble(minValue);
85+
out.writeOptionalDouble(maxValue);
86+
out.writeOptionalDouble(meanValue);
87+
out.writeOptionalDouble(medianValue);
88+
out.writeCollection(topHits, StreamOutput::writeMap);
89+
}
90+
6791
public long getCount() {
6892
return count;
6993
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
package org.elasticsearch.xpack.core.ml.filestructurefinder;
77

88
import org.elasticsearch.common.ParseField;
9+
import org.elasticsearch.common.io.stream.StreamInput;
10+
import org.elasticsearch.common.io.stream.StreamOutput;
11+
import org.elasticsearch.common.io.stream.Writeable;
912
import org.elasticsearch.common.xcontent.ObjectParser;
1013
import org.elasticsearch.common.xcontent.ToXContentObject;
1114
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -24,7 +27,7 @@
2427
/**
2528
* Stores the file format determined by Machine Learning.
2629
*/
27-
public class FileStructure implements ToXContentObject {
30+
public class FileStructure implements ToXContentObject, Writeable {
2831

2932
public enum Format {
3033

@@ -79,6 +82,8 @@ public String toString() {
7982
}
8083
}
8184

85+
public static final String EXPLAIN = "explain";
86+
8287
static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
8388
static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
8489
static final ParseField SAMPLE_START = new ParseField("sample_start");
@@ -176,6 +181,66 @@ public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampl
176181
this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
177182
}
178183

184+
public FileStructure(StreamInput in) throws IOException {
185+
numLinesAnalyzed = in.readVInt();
186+
numMessagesAnalyzed = in.readVInt();
187+
sampleStart = in.readString();
188+
charset = in.readString();
189+
hasByteOrderMarker = in.readOptionalBoolean();
190+
format = in.readEnum(Format.class);
191+
multilineStartPattern = in.readOptionalString();
192+
excludeLinesPattern = in.readOptionalString();
193+
inputFields = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
194+
hasHeaderRow = in.readOptionalBoolean();
195+
delimiter = in.readBoolean() ? (char) in.readVInt() : null;
196+
shouldTrimFields = in.readOptionalBoolean();
197+
grokPattern = in.readOptionalString();
198+
timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
199+
timestampField = in.readOptionalString();
200+
needClientTimezone = in.readBoolean();
201+
mappings = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap()));
202+
fieldStats = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap(StreamInput::readString, FieldStats::new)));
203+
explanation = Collections.unmodifiableList(in.readList(StreamInput::readString));
204+
}
205+
206+
@Override
207+
public void writeTo(StreamOutput out) throws IOException {
208+
out.writeVInt(numLinesAnalyzed);
209+
out.writeVInt(numMessagesAnalyzed);
210+
out.writeString(sampleStart);
211+
out.writeString(charset);
212+
out.writeOptionalBoolean(hasByteOrderMarker);
213+
out.writeEnum(format);
214+
out.writeOptionalString(multilineStartPattern);
215+
out.writeOptionalString(excludeLinesPattern);
216+
if (inputFields == null) {
217+
out.writeBoolean(false);
218+
} else {
219+
out.writeBoolean(true);
220+
out.writeCollection(inputFields, StreamOutput::writeString);
221+
}
222+
out.writeOptionalBoolean(hasHeaderRow);
223+
if (delimiter == null) {
224+
out.writeBoolean(false);
225+
} else {
226+
out.writeBoolean(true);
227+
out.writeVInt(delimiter);
228+
}
229+
out.writeOptionalBoolean(shouldTrimFields);
230+
out.writeOptionalString(grokPattern);
231+
if (timestampFormats == null) {
232+
out.writeBoolean(false);
233+
} else {
234+
out.writeBoolean(true);
235+
out.writeCollection(timestampFormats, StreamOutput::writeString);
236+
}
237+
out.writeOptionalString(timestampField);
238+
out.writeBoolean(needClientTimezone);
239+
out.writeMap(mappings);
240+
out.writeMap(fieldStats, StreamOutput::writeString, (out1, value) -> value.writeTo(out1));
241+
out.writeCollection(explanation, StreamOutput::writeString);
242+
}
243+
179244
public int getNumLinesAnalyzed() {
180245
return numLinesAnalyzed;
181246
}
@@ -300,7 +365,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
300365
}
301366
builder.endObject();
302367
}
303-
builder.field(EXPLANATION.getPreferredName(), explanation);
368+
if (params.paramAsBoolean(EXPLAIN, false)) {
369+
builder.field(EXPLANATION.getPreferredName(), explanation);
370+
}
304371
builder.endObject();
305372

306373
return builder;

0 commit comments

Comments
 (0)