From 2a04364e3a9327a7cba651627cea9025cf17907b Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 16 Oct 2024 13:24:58 +0200 Subject: [PATCH 01/31] Implement MatchingStats for SQL mode --- .../mode/cluster/WorkerMatchingStats.java | 132 ++++++++ .../cluster/WorkerUpdateMatchingStatsJob.java | 153 +++++++++ .../mode/local/LocalNamespaceHandler.java | 10 +- .../conquery/mode/local/SqlMatchingStats.java | 29 ++ .../mode/local/SqlUpdateMatchingStatsJob.java | 316 ++++++++++++++++++ .../models/config/SqlConnectorConfig.java | 11 + .../datasets/concepts/MatchingStats.java | 126 +------ .../concepts/tree/ConceptTreeNode.java | 9 +- .../specific/UpdateElementMatchingStats.java | 16 +- .../specific/UpdateMatchingStatsMessage.java | 152 +-------- .../models/worker/LocalNamespace.java | 32 +- .../conquery/models/worker/Namespace.java | 19 +- .../dialect/PostgreSqlFunctionProvider.java | 22 +- .../execution/DefaultSqlCDateSetParser.java | 7 +- .../sql/dialect/TestSqlConnectorConfig.java | 2 +- .../tests/MetadataCollectionTest.java | 59 ++-- .../concepts/tree/MatchingStatsTests.java | 98 +++--- .../src/test/resources/shared/icd.test.json | 210 ++++++++++++ .../src/test/resources/shared/kh-content.csv | 11 + 19 files changed, 1045 insertions(+), 369 deletions(-) create mode 100644 backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java create mode 100644 backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java create mode 100644 backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java create mode 100644 backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java create mode 100644 backend/src/test/resources/shared/icd.test.json create mode 100644 backend/src/test/resources/shared/kh-content.csv diff --git a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java new 
file mode 100644 index 0000000000..a5650c4587 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java @@ -0,0 +1,132 @@ +package com.bakdata.conquery.mode.cluster; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import com.bakdata.conquery.models.common.daterange.CDateRange; +import com.bakdata.conquery.models.datasets.Column; +import com.bakdata.conquery.models.datasets.Table; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; +import com.bakdata.conquery.models.events.Bucket; +import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +@Getter +@Setter +public class WorkerMatchingStats implements MatchingStats { + + private Map entries = new HashMap<>(); + + @JsonIgnore + private transient CDateRange span; + + @JsonIgnore + private transient long numberOfEvents = -1L; + + @JsonIgnore + private transient long numberOfEntities = -1L; + + public long countEvents() { + if (numberOfEvents == -1L) { + synchronized (this) { + if (numberOfEvents == -1L) { + numberOfEvents = entries.values().stream().mapToLong(Entry::getNumberOfEvents).sum(); + } + } + } + return numberOfEvents; + } + + + public long countEntities() { + if (numberOfEntities == -1L) { + synchronized (this) { + if (numberOfEntities == -1L) { + numberOfEntities = entries.values().stream().mapToLong(Entry::getNumberOfEntities).sum(); + } + } + } + return numberOfEntities; + } + + public CDateRange spanEvents() { + if (span == null) { + synchronized (this) { + if (span == null) { + span = entries.values().stream().map(Entry::getSpan).reduce(CDateRange.all(), CDateRange::spanClosed); + } + } + } + return span; + + } + + public void putEntry(WorkerId source, Entry entry) { + synchronized (this) 
{ + entries.put(source, entry); + span = null; + numberOfEntities = -1L; + numberOfEvents = -1L; + } + } + + @Data + @NoArgsConstructor + @AllArgsConstructor + public static class Entry { + private long numberOfEvents; + + @JsonIgnore + private final Set foundEntities = new HashSet<>(); + private long numberOfEntities; + private int minDate = Integer.MAX_VALUE; + private int maxDate = Integer.MIN_VALUE; + + @JsonIgnore + public CDateRange getSpan() { + if (minDate == Integer.MAX_VALUE && maxDate == Integer.MIN_VALUE) { + return null; + } + + return CDateRange.of( + minDate == Integer.MAX_VALUE ? Integer.MIN_VALUE : minDate, + maxDate == Integer.MIN_VALUE ? Integer.MAX_VALUE : maxDate + ); + } + + public void addEvent(Table table, Bucket bucket, int event, String entityForEvent) { + numberOfEvents++; + if (foundEntities.add(entityForEvent)) { + numberOfEntities++; + } + + for (Column c : table.getColumns()) { + if (!c.getType().isDateCompatible()) { + continue; + } + + if (!bucket.has(event, c)) { + continue; + } + + final CDateRange time = bucket.getAsDateRange(event, c); + + if (time.hasUpperBound()) { + maxDate = Math.max(time.getMaxValue(), maxDate); + } + + if (time.hasLowerBound()) { + minDate = Math.min(time.getMinValue(), minDate); + } + } + } + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java new file mode 100644 index 0000000000..7e3b7e92a4 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java @@ -0,0 +1,153 @@ +package com.bakdata.conquery.mode.cluster; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import 
com.bakdata.conquery.models.datasets.Table; +import com.bakdata.conquery.models.datasets.concepts.Concept; +import com.bakdata.conquery.models.datasets.concepts.ConceptElement; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; +import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; +import com.bakdata.conquery.models.events.Bucket; +import com.bakdata.conquery.models.events.CBlock; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; +import com.bakdata.conquery.models.jobs.Job; +import com.bakdata.conquery.models.messages.namespaces.specific.UpdateElementMatchingStats; +import com.bakdata.conquery.models.worker.Worker; +import com.bakdata.conquery.util.progressreporter.ProgressReporter; +import com.google.common.base.Functions; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +public class WorkerUpdateMatchingStatsJob extends Job { + private final Worker worker; + private final Collection concepts; + + @Override + public void execute() throws Exception { + if (worker.getStorage().getAllCBlocks().findAny().isEmpty()) { + log.debug("Worker {} is empty, skipping.", worker); + return; + } + + final ProgressReporter progressReporter = getProgressReporter(); + progressReporter.setMax(concepts.size()); + + log.info("BEGIN update Matching stats for {} Concepts", concepts.size()); + + final Map> + subJobs = + concepts.stream() + .collect(Collectors.toMap(Functions.identity(), + concept -> CompletableFuture.runAsync(() -> { + final Concept resolved = concept.resolve(); + final Map, WorkerMatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); + + calculateConceptMatches(resolved, matchingStats, worker); + worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), 
matchingStats)); + + progressReporter.report(1); + }, worker.getJobsExecutorService()) + )); + + + log.debug("All jobs submitted. Waiting for completion."); + + + final CompletableFuture all = CompletableFuture.allOf(subJobs.values().toArray(CompletableFuture[]::new)); + + do { + try { + all.get(1, TimeUnit.MINUTES); + } + catch (TimeoutException exception) { + // Count unfinished matching stats jobs. + if (log.isDebugEnabled()) { + final long unfinished = subJobs.values().stream().filter(Predicate.not(CompletableFuture::isDone)).count(); + log.debug("{} still waiting for {} tasks", worker.getInfo().getDataset(), unfinished); + } + + // When trace, also log the unfinished jobs. + if (log.isTraceEnabled()) { + subJobs.forEach((concept, future) -> { + if (future.isDone()) { + return; + } + + log.trace("Still waiting for `{}`", concept); + + }); + } + } + } while (!all.isDone()); + + log.debug("DONE collecting matching stats for {}", worker.getInfo().getDataset()); + + } + + @Override + public String getLabel() { + return String.format("Calculate Matching Stats for %s", worker.getInfo().getDataset()); + } + + private static void calculateConceptMatches(Concept concept, Map, WorkerMatchingStats.Entry> results, Worker worker) { + log.debug("BEGIN calculating for `{}`", concept.getId()); + + for (CBlock cBlock : worker.getStorage().getAllCBlocks().toList()) { + + if (!cBlock.getConnector().getConcept().equals(concept.getId())) { + continue; + } + + try { + final Bucket bucket = cBlock.getBucket().resolve(); + final Table table = bucket.getTable().resolve(); + + for (String entity : bucket.entities()) { + + final int entityEnd = bucket.getEntityEnd(entity); + + for (int event = bucket.getEntityStart(entity); event < entityEnd; event++) { + + final int[] localIds = cBlock.getPathToMostSpecificChild(event); + + + if (!(concept instanceof TreeConcept) || localIds == null) { + results.computeIfAbsent(concept.getId(), (ignored) -> new 
WorkerMatchingStats.Entry()).addEvent(table, bucket, event, entity); + continue; + } + + if (Connector.isNotContained(localIds)) { + continue; + } + + ConceptTreeNode element = ((TreeConcept) concept).getElementByLocalIdPath(localIds); + + while (element != null) { + results.computeIfAbsent(((ConceptElement) element).getId(), (ignored) -> new WorkerMatchingStats.Entry()) + .addEvent(table, bucket, event, entity); + element = element.getParent(); + } + } + } + + } + catch (Exception e) { + log.error("Failed to collect the matching stats for {}", cBlock, e); + } + } + + log.trace("DONE calculating for `{}`", concept.getId()); + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java b/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java index 6a40a74705..47f2d6379c 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java @@ -37,7 +37,11 @@ public class LocalNamespaceHandler implements NamespaceHandler { private final SqlDialectFactory dialectFactory; @Override - public LocalNamespace createNamespace(NamespaceStorage namespaceStorage, MetaStorage metaStorage, DatasetRegistry datasetRegistry, Environment environment) { + public LocalNamespace createNamespace( + NamespaceStorage namespaceStorage, + MetaStorage metaStorage, + DatasetRegistry datasetRegistry, + Environment environment) { NamespaceSetupData namespaceData = NamespaceHandler.createNamespaceSetup(namespaceStorage, config, internalMapperFactory, datasetRegistry, environment); @@ -60,6 +64,10 @@ public LocalNamespace createNamespace(NamespaceStorage namespaceStorage, MetaSto return new LocalNamespace( namespaceData.getPreprocessMapper(), namespaceStorage, + sqlConnectorConfig, + databaseConfig, + sqlDialect, + sqlExecutionService, executionManager, dslContextWrapper, sqlStorageHandler, diff --git 
a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java new file mode 100644 index 0000000000..a47199712b --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java @@ -0,0 +1,29 @@ +package com.bakdata.conquery.mode.local; + +import com.bakdata.conquery.models.common.daterange.CDateRange; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; +import lombok.Value; + +@Value +public class SqlMatchingStats implements MatchingStats { + + long numberOfEvents; + long numberOfEntities; + CDateRange span; + + @Override + public long countEvents() { + return numberOfEvents; + } + + @Override + public long countEntities() { + return numberOfEntities; + } + + @Override + public CDateRange spanEvents() { + return span; + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java new file mode 100644 index 0000000000..8833b6a1ea --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -0,0 +1,316 @@ +package com.bakdata.conquery.mode.local; + +import static org.jooq.impl.DSL.*; + +import java.math.BigDecimal; +import java.sql.ResultSet; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.function.BinaryOperator; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.bakdata.conquery.models.common.daterange.CDateRange; +import 
com.bakdata.conquery.models.config.DatabaseConfig; +import com.bakdata.conquery.models.datasets.concepts.Concept; +import com.bakdata.conquery.models.datasets.concepts.ConceptElement; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.datasets.concepts.conditions.CTCondition; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; +import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; +import com.bakdata.conquery.models.jobs.Job; +import com.bakdata.conquery.sql.conversion.SharedAliases; +import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; +import com.bakdata.conquery.sql.conversion.dialect.SqlFunctionProvider; +import com.bakdata.conquery.sql.conversion.model.ColumnDateRange; +import com.bakdata.conquery.sql.execution.SqlExecutionService; +import com.bakdata.conquery.util.TablePrimaryColumnUtil; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.jooq.Condition; +import org.jooq.DSLContext; +import org.jooq.Field; +import org.jooq.Record; +import org.jooq.Record1; +import org.jooq.Result; +import org.jooq.Select; +import org.jooq.SelectConditionStep; +import org.jooq.SelectJoinStep; +import org.jooq.Table; + +@Slf4j +public class SqlUpdateMatchingStatsJob extends Job { + + private static final String EVENTS_FIELD = "events"; + private static final String EVENTS_TABLE = "events_unioned"; + private static final String PRIMARY_COLUMN_ALIAS = SharedAliases.PRIMARY_COLUMN.getAlias(); + private static final String ENTITIES_TABLE = "entities"; + private static final String VALIDITY_DATE_SELECT = "unioned"; + private static final String VALIDITY_DATES_TABLE = "validity_dates"; + + private final DatabaseConfig databaseConfig; + private final SqlExecutionService 
executionService;
+	private final DSLContext dslContext;
+	private final SqlFunctionProvider functionProvider;
+	private final Set concepts;
+	private final ExecutorService executors;
+
+	public SqlUpdateMatchingStatsJob(
+			DatabaseConfig databaseConfig,
+			SqlExecutionService executionService,
+			SqlFunctionProvider functionProvider,
+			Set concepts,
+			ExecutorService executors
+	) {
+		this.databaseConfig = databaseConfig;
+		this.executionService = executionService;
+		this.dslContext = executionService.getDslContext();
+		this.functionProvider = functionProvider;
+		this.concepts = concepts;
+		this.executors = executors;
+	}
+
+	@Override
+	public String getLabel() {
+		return "Calculating Matching Stats for %s.".formatted(executionService);
+	}
+
+	/**
+	 * Creates one {@link SqlMatchingStatsTask} per node of every {@link TreeConcept}, submits all of them to {@link #executors},
+	 * waits for each result and finally shuts the executor down.
+	 * <p>
+	 * Note: the executor is shut down at the end of this method, so it cannot be reused afterwards.
+	 */
+	@Override
+	public void execute() throws Exception {
+
+		log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size());
+
+		concepts.stream()
+				.map(ConceptId::resolve)
+				.filter(SqlUpdateMatchingStatsJob::isTreeConcept)
+				.flatMap(concept -> collectMatchingStats(concept.getConnectors(), (TreeConcept) concept))
+				.map(executors::submit)
+				// Streams are lazy: without materializing the futures first, every task would be submitted and then
+				// awaited one after another, so the executor would never run more than one task at a time.
+				.toList()
+				.forEach(SqlUpdateMatchingStatsJob::checkForError);
+
+		executors.shutdown();
+		while (!executors.awaitTermination(1, TimeUnit.MINUTES)) {
+			log.debug("Waiting for executors to set matching stats for all concepts...");
+		}
+
+		log.debug("DONE collecting matching stats.");
+	}
+
+	/**
+	 * Cancels the job, stops the executor and cancels every task that was still queued, so that nobody
+	 * waiting on the corresponding {@link Future} blocks forever on a task that will never be started.
+	 */
+	@Override
+	public void cancel() {
+		super.cancel();
+		for (Runnable pending : executors.shutdownNow()) {
+			// Tasks handed in via submit() are usually wrapped as Futures; the guard keeps other executor implementations safe.
+			if (pending instanceof Future<?> pendingFuture) {
+				pendingFuture.cancel(true);
+			}
+		}
+	}
+
+	/**
+	 * Waits for the given task and logs (but does not rethrow) any failure, so one failing node does not abort the remaining ones.
+	 * If the waiting thread is interrupted, the interrupt flag is restored for the caller.
+	 */
+	private static void checkForError(Future future) {
+		try {
+			future.get();
+		}
+		catch (ExecutionException e) {
+			log.error("Unknown error while querying SQL matching stats. Cause: \n", e.getCause());
+		}
+		catch (java.util.concurrent.CancellationException e) {
+			log.debug("SQL matching stats task was cancelled before it completed.");
+		}
+		catch (InterruptedException e) {
+			// Restore the flag so that execute() (e.g. awaitTermination) notices the interruption.
+			Thread.currentThread().interrupt();
+			log.warn("Interrupted while waiting for SQL matching stats.", e);
+		}
+	}
+
+	/**
+	 * @return true if the given concept is a {@link TreeConcept}; otherwise logs an error and returns false.
+	 */
+	private static boolean isTreeConcept(Concept concept) {
+		if (!(concept instanceof TreeConcept)) {
+			log.error("Collecting MatchingStats is currently only supported for TreeConcepts.");
+			return false;
+		}
+		return true;
+	}
+
+	/**
+	 * Recursively creates a {@link SqlMatchingStatsTask} for the given node and all of its descendants (children first).
+	 */
+	private Stream> collectMatchingStats(List connectors, ConceptTreeNode treeNode) {
+		return Stream.concat(
+				treeNode.getChildren().stream().flatMap(child -> collectMatchingStats(connectors, child)),
+				Stream.of(new SqlMatchingStatsTask(connectors, (ConceptElement) treeNode))
+		);
+	}
+
+	/**
+	 * Applies a count(*) on each connector's table, unions these tables and finally calculates the sum() of the count per connector
+	 * to obtain the concept's total event count.
+	 */
+	private long collectEventCount(List connectors, Optional childCondition) {
+
+		org.jooq.Table> eventsUnioned =
+				union(connectors, connector -> createCountEventsQuery(connector, childCondition), Select::unionAll, EVENTS_TABLE);
+
+		SelectJoinStep> eventsQuery = dslContext.select(sum(eventsUnioned.field(EVENTS_FIELD, BigDecimal.class)).as(EVENTS_FIELD))
+				.from(eventsUnioned);
+
+		Result result = executionService.fetch(eventsQuery);
+		try {
+			BigDecimal events = (BigDecimal) result.getValue(0, EVENTS_FIELD);
+			return Objects.requireNonNull(events).longValue();
+		}
+		catch (Exception e) {
+			log.error("Expecting exactly 1 column of numeric type and 1 row in Result when querying for events of a concept node. Error: ", e);
+			return 0;
+		}
+	}
+
+	private SelectConditionStep> createCountEventsQuery(Connector connector, Optional childCondition) {
+		return dslContext.select(count().as(EVENTS_FIELD))
+				.from(table(name(connector.getResolvedTable().getName())))
+				.where(toJooqCondition(connector, childCondition));
+	}
+
+	/**
+	 * Selects the PIDs for each connector, unions these tables and does a countDistinct(pid) to obtain the concepts total entity count.
+	 */
+	private long collectEntityCount(List connectors, Optional childCondition) {
+
+		org.jooq.Table> entitiesUnioned =
+				union(connectors, connector -> createCountEntitiesQuery(connector, childCondition), Select::union, ENTITIES_TABLE);
+
+		SelectJoinStep> entitiesQuery =
+				dslContext.select(countDistinct(entitiesUnioned.field(PRIMARY_COLUMN_ALIAS)).as(PRIMARY_COLUMN_ALIAS))
+						.from(entitiesUnioned);
+
+		Result result = executionService.fetch(entitiesQuery);
+		try {
+			// The COUNT(DISTINCT ...) select yields a numeric value whose concrete Java type we do not rely on here,
+			// but MatchingStats expect a long.
+			Number value = (Number) result.getValue(0, PRIMARY_COLUMN_ALIAS);
+			return Objects.requireNonNull(value).longValue();
+		}
+		catch (Exception e) {
+			log.error("Expecting exactly 1 column of numeric type and 1 row in Result when querying for entities of a concept node. Error: ", e);
+			return 0;
+		}
+	}
+
+	/**
+	 * Selects the primary column (aliased to {@link #PRIMARY_COLUMN_ALIAS}) of the connector's table,
+	 * restricted by the child condition or, if absent, the connector's own condition.
+	 */
+	private SelectConditionStep> createCountEntitiesQuery(Connector connector, Optional childCondition) {
+		Field primaryColumn = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(PRIMARY_COLUMN_ALIAS);
+		Table connectorTable = table(name(connector.getResolvedTable().getName()));
+		Condition connectorCondition = toJooqCondition(connector, childCondition);
+		return dslContext.select(primaryColumn)
+				.from(connectorTable)
+				.where(connectorCondition);
+	}
+
+	/**
+	 * For each connector and each of its validity dates, we select the start and end date, union all these tables and select the min(start) and max(end)
+	 * to obtain the concepts total date span.
+	 *
+	 * @return A {@link CDateRange} with the min and max validity date over all the given connectors. Null, if the given connectors have no validity date at all.
+ */ + private CDateRange collectDateSpan(List connectors, Optional childCondition) { + + Map> validityDateMap = connectors.stream().collect( + // we create all validity dates with the same alias to union them later + Collectors.toMap(Function.identity(), connector -> createColumnDateRanges(connector, VALIDITY_DATE_SELECT)) + ); + if (validityDateMap.values().stream().allMatch(List::isEmpty)) { + return null; + } + + org.jooq.Table validityDatesUnioned = unionAllValidityDates(validityDateMap, childCondition); + // we just need any of the generated column date ranges to get the name of the unioned field(s) + ColumnDateRange anyOfTheUnionedDates = validityDateMap.get(connectors.get(0)).get(0); + // ensure we have a start and end field (and not a single-column range), because we need to get the min(start) and max(end) + ColumnDateRange dualColumn = functionProvider.toDualColumn(anyOfTheUnionedDates); + // the get the overall min and max + ColumnDateRange minAndMax = ColumnDateRange.of(min(dualColumn.getStart()), max(dualColumn.getEnd())); + // finally, we create the proper string expression which handles possible +/-infinity date values + Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(VALIDITY_DATE_SELECT); + SelectJoinStep> dateSpanQuery = dslContext.select(validityDateExpression) + .from(validityDatesUnioned); + + Result result = executionService.fetch(dateSpanQuery); + try (ResultSet resultSet = result.intoResultSet()) { + + // If no values were encountered this the result is empty: Table might be empty, or condition does not match any node. + if (!resultSet.isBeforeFirst()) { + return null; + } + + resultSet.next(); // we advance to first line of the ResultSet + List dateRange = executionService.getResultSetProcessor().getDateRange(resultSet, 1); + return !dateRange.isEmpty() ? 
CDateRange.fromList(dateRange) : null; + } + catch (Exception e) { + log.error("Expecting exactly 1 column containing a daterange expression when querying for the date span of a concept. Error: ", e); + return null; + } + } + + private List createColumnDateRanges(Connector connector, String alias) { + return connector.getValidityDates().stream() + .map(functionProvider::forValidityDate) + .map(daterange -> daterange.as(alias)) + .toList(); + } + + private org.jooq.Table unionAllValidityDates(Map> validityDateMap, Optional childCondition) { + return validityDateMap.entrySet().stream() + .flatMap(entry -> { + Connector connector = entry.getKey(); + List validityDates = entry.getValue(); + return validityDates.stream().map(columnDateRange -> createValidityDateQuery(columnDateRange, connector, childCondition)); + }) + .reduce((validityDate1, validityDate2) -> (SelectConditionStep) validityDate1.unionAll(validityDate2)) + .orElseThrow(() -> new RuntimeException("Expected at least 1 validity date to be present.")) + .asTable(name(VALIDITY_DATES_TABLE)); + } + + private SelectConditionStep createValidityDateQuery(ColumnDateRange columnDateRange, Connector connector, Optional childCondition) { + return dslContext.select(columnDateRange.toFields()) + .from(table(name(connector.getResolvedTable().getName()))) + .where(toJooqCondition(connector, childCondition)); + } + + private static org.jooq.Table union( + Collection input, + Function> mapper, + BinaryOperator> operator, + String tableName + ) { + return input.stream() + .map(mapper) + .reduce(operator) + .orElseThrow(() -> new IllegalStateException("Expected at least one element to union")) + .asTable(name(tableName)); + } + + private Condition toJooqCondition(Connector connector, Optional childCondition) { + CTConditionContext context = CTConditionContext.create(connector, functionProvider); + return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) + .map(condition -> 
condition.convertToSqlCondition(context).condition()) + .orElse(noCondition()); + } + + @RequiredArgsConstructor + private class SqlMatchingStatsTask implements Callable { + + private final List connectors; + private final ConceptElement treeNode; + + @Override + public Void call() { + Optional childCondition = treeNode instanceof ConceptTreeChild treeChild + ? Optional.of(treeChild.getCondition()) + : Optional.empty(); + + long events = collectEventCount(connectors, childCondition); + long entities = collectEntityCount(connectors, childCondition); + CDateRange span = collectDateSpan(connectors, childCondition); + + SqlMatchingStats matchingStats = new SqlMatchingStats(events, entities, span); + treeNode.setMatchingStats(matchingStats); + + return null; + } + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java index ef66c3ca8a..e724d44400 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java +++ b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java @@ -2,7 +2,9 @@ import java.util.Map; import jakarta.validation.Valid; +import jakarta.validation.constraints.Min; +import com.bakdata.conquery.mode.local.SqlUpdateMatchingStatsJob; import com.bakdata.conquery.models.datasets.Dataset; import com.fasterxml.jackson.annotation.JsonIgnore; import io.dropwizard.util.Duration; @@ -27,6 +29,8 @@ @AllArgsConstructor public class SqlConnectorConfig { + private static final int DEFAULT_BACKGROUND_THREADS = 1; + private boolean enabled; /** @@ -34,6 +38,13 @@ public class SqlConnectorConfig { */ private boolean withPrettyPrinting; + /** + * The amount of threads for background tasks like calculating matching stats {@link SqlUpdateMatchingStatsJob}. 
+ */ + @Min(1) + @Builder.Default + private int backgroundThreads = DEFAULT_BACKGROUND_THREADS; + /** * Keys must match the name of existing {@link Dataset}s. */ diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java index 293d845f7d..d3590f389b 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java @@ -1,130 +1,16 @@ package com.bakdata.conquery.models.datasets.concepts; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import javax.annotation.Nullable; import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.Column; -import com.bakdata.conquery.models.datasets.Table; -import com.bakdata.conquery.models.events.Bucket; -import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; -import com.fasterxml.jackson.annotation.JsonIgnore; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -@Getter -@Setter -public class MatchingStats { +public interface MatchingStats { - private Map entries = new HashMap<>(); - @JsonIgnore - private transient CDateRange span; + long countEvents(); - @JsonIgnore - private transient long numberOfEvents = -1L; + long countEntities(); - @JsonIgnore - private transient long numberOfEntities = -1L; - - public long countEvents() { - if (numberOfEvents == -1L) { - synchronized (this) { - if (numberOfEvents == -1L) { - numberOfEvents = entries.values().stream().mapToLong(Entry::getNumberOfEvents).sum(); - } - } - } - return numberOfEvents; - } - - - public long countEntities() { - if (numberOfEntities == -1L) { - synchronized (this) { - if (numberOfEntities == -1L) { - numberOfEntities 
= entries.values().stream().mapToLong(Entry::getNumberOfEntities).sum(); - } - } - } - return numberOfEntities; - } - - public CDateRange spanEvents() { - if (span == null) { - synchronized (this) { - if (span == null) { - span = entries.values().stream().map(Entry::getSpan).reduce(CDateRange.all(), CDateRange::spanClosed); - } - } - } - return span; - - } - - public void putEntry(WorkerId source, Entry entry) { - synchronized (this) { - entries.put(source, entry); - span = null; - numberOfEntities = -1L; - numberOfEvents = -1L; - } - } - - @Data - @NoArgsConstructor - @AllArgsConstructor - public static class Entry { - private long numberOfEvents; - - @JsonIgnore - private final Set foundEntities = new HashSet<>(); - private long numberOfEntities; - private int minDate = Integer.MAX_VALUE; - private int maxDate = Integer.MIN_VALUE; - - @JsonIgnore - public CDateRange getSpan() { - if(minDate == Integer.MAX_VALUE && maxDate == Integer.MIN_VALUE) { - return null; - } - - return CDateRange.of( - minDate == Integer.MAX_VALUE ? Integer.MIN_VALUE : minDate, - maxDate == Integer.MIN_VALUE ? 
Integer.MAX_VALUE : maxDate - ); - } - - public void addEvent(Table table, Bucket bucket, int event, String entityForEvent) { - numberOfEvents++; - if (foundEntities.add(entityForEvent)) { - numberOfEntities++; - } - - for (Column c : table.getColumns()) { - if (!c.getType().isDateCompatible()) { - continue; - } - - if (!bucket.has(event, c)) { - continue; - } - - final CDateRange time = bucket.getAsDateRange(event, c); - - if (time.hasUpperBound()){ - maxDate = Math.max(time.getMaxValue(), maxDate); - } - - if (time.hasLowerBound()){ - minDate = Math.min(time.getMinValue(), minDate); - } - } - } - } + @Nullable + CDateRange spanEvents(); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java index 75f5d9d532..6c6bd91b62 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java @@ -3,7 +3,6 @@ import java.util.List; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.identifiable.Named; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; import com.fasterxml.jackson.annotation.JsonBackReference; @@ -14,16 +13,20 @@ public interface ConceptTreeNode getChildren(); + int getLocalId(); + int getDepth(); + @JsonIgnore int[] getPrefix(); + @JsonBackReference ConceptTreeNode getParent(); - void setLocalId(int size); - MatchingStats getMatchingStats(); + void setLocalId(int size); String getDescription(); + String getLabel(); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java 
b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java index 9d5821b198..10ff3a2979 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java @@ -5,9 +5,9 @@ import java.util.Map.Entry; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.mode.cluster.WorkerMatchingStats; import com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; @@ -26,17 +26,18 @@ @Getter @ToString public class UpdateElementMatchingStats extends NamespaceMessage { + private final WorkerId source; @ToString.Exclude - private final Map, MatchingStats.Entry> values; + private final Map, WorkerMatchingStats.Entry> values; @Override public void react(DistributedNamespace context) throws Exception { // We collect the concepts outside the loop to update the storage afterward Map> conceptsToUpdate = new HashMap<>(); - for (Entry, MatchingStats.Entry> entry : values.entrySet()) { + for (Entry, WorkerMatchingStats.Entry> entry : values.entrySet()) { try { ConceptElementId element = entry.getKey(); ConceptId conceptId = element.findConcept(); @@ -47,17 +48,18 @@ public void react(DistributedNamespace context) throws Exception { final ConceptElement target = concept.findById(element); - final MatchingStats.Entry value = entry.getValue(); + final WorkerMatchingStats.Entry value = entry.getValue(); conceptsToUpdate.put(conceptId, concept); - MatchingStats matchingStats = target.getMatchingStats(); 
+ WorkerMatchingStats matchingStats = (WorkerMatchingStats) target.getMatchingStats(); if (matchingStats == null) { - matchingStats = new MatchingStats(); + matchingStats = new WorkerMatchingStats(); target.setMatchingStats(matchingStats); } matchingStats.putEntry(source, value); - } catch (Exception e) { + } + catch (Exception e) { log.error("Failed to set matching stats for '{}' (enable TRACE for exception)", entry.getKey(), (Exception) (log.isTraceEnabled() ? e : null)); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java index 19f54c8d13..82b9d66474 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java +++ b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java @@ -1,33 +1,14 @@ package com.bakdata.conquery.models.messages.namespaces.specific; import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; -import java.util.function.Predicate; -import java.util.stream.Collectors; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.Table; -import com.bakdata.conquery.models.datasets.concepts.Concept; -import com.bakdata.conquery.models.datasets.concepts.ConceptElement; -import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; -import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; -import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; -import com.bakdata.conquery.models.events.Bucket; -import com.bakdata.conquery.models.events.CBlock; -import 
com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; +import com.bakdata.conquery.mode.cluster.WorkerUpdateMatchingStatsJob; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; -import com.bakdata.conquery.models.jobs.Job; import com.bakdata.conquery.models.messages.namespaces.NamespacedMessage; import com.bakdata.conquery.models.messages.namespaces.WorkerMessage; import com.bakdata.conquery.models.worker.Worker; -import com.bakdata.conquery.util.progressreporter.ProgressReporter; import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.Functions; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -46,135 +27,6 @@ public class UpdateMatchingStatsMessage extends WorkerMessage { @Override public void react(Worker worker) throws Exception { - worker.getJobManager().addSlowJob(new UpdateMatchingStatsJob(worker, concepts)); + worker.getJobManager().addSlowJob(new WorkerUpdateMatchingStatsJob(worker, concepts)); } - - @RequiredArgsConstructor - private static class UpdateMatchingStatsJob extends Job { - private final Worker worker; - private final Collection concepts; - - @Override - public void execute() throws Exception { - if (worker.getStorage().getAllCBlocks().findAny().isEmpty()) { - log.debug("Worker {} is empty, skipping.", worker); - return; - } - - final ProgressReporter progressReporter = getProgressReporter(); - progressReporter.setMax(concepts.size()); - - log.info("BEGIN update Matching stats for {} Concepts", concepts.size()); - - final Map> - subJobs = - concepts.stream() - .collect(Collectors.toMap(Functions.identity(), - concept -> CompletableFuture.runAsync(() -> { - final Concept resolved = concept.resolve(); - final Map, MatchingStats.Entry> - matchingStats = - new HashMap<>(resolved.countElements()); - - calculateConceptMatches(resolved, matchingStats, worker); - - worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), 
matchingStats)); - - progressReporter.report(1); - }, worker.getJobsExecutorService()) - )); - - - log.debug("All jobs submitted. Waiting for completion."); - - - final CompletableFuture all = CompletableFuture.allOf(subJobs.values().toArray(CompletableFuture[]::new)); - - do { - try { - all.get(1, TimeUnit.MINUTES); - } - catch (TimeoutException exception) { - // Count unfinished matching stats jobs. - if (log.isDebugEnabled()) { - final long unfinished = subJobs.values().stream().filter(Predicate.not(CompletableFuture::isDone)).count(); - log.debug("{} still waiting for {} tasks", worker.getInfo().getDataset(), unfinished); - } - - // When trace, also log the unfinished jobs. - if (log.isTraceEnabled()) { - subJobs.forEach((concept, future) -> { - if (future.isDone()) { - return; - } - - log.trace("Still waiting for `{}`", concept); - - }); - } - } - } while (!all.isDone()); - - log.debug("DONE collecting matching stats for {}", worker.getInfo().getDataset()); - - } - - @Override - public String getLabel() { - return String.format("Calculate Matching Stats for %s", worker.getInfo().getDataset()); - } - - private static void calculateConceptMatches(Concept concept, Map, MatchingStats.Entry> results, Worker worker) { - log.debug("BEGIN calculating for `{}`", concept.getId()); - - for (CBlock cBlock : worker.getStorage().getAllCBlocks().toList()) { - - if (!cBlock.getConnector().getConcept().equals(concept.getId())) { - continue; - } - - try { - final Bucket bucket = cBlock.getBucket().resolve(); - final Table table = bucket.getTable().resolve(); - - for (String entity : bucket.entities()) { - - final int entityEnd = bucket.getEntityEnd(entity); - - for (int event = bucket.getEntityStart(entity); event < entityEnd; event++) { - - final int[] localIds = cBlock.getPathToMostSpecificChild(event); - - - if (!(concept instanceof TreeConcept) || localIds == null) { - results.computeIfAbsent(concept.getId(), (ignored) -> new MatchingStats.Entry()).addEvent(table, bucket, 
event, entity); - continue; - } - - if (Connector.isNotContained(localIds)) { - continue; - } - - ConceptTreeNode element = ((TreeConcept) concept).getElementByLocalIdPath(localIds); - - while (element != null) { - results.computeIfAbsent(((ConceptElement) element).getId(), (ignored) -> new MatchingStats.Entry()) - .addEvent(table, bucket, event, entity); - element = element.getParent(); - } - } - } - - } - catch (Exception e) { - log.error("Failed to collect the matching stats for {}", cBlock, e); - } - } - - log.trace("DONE calculating for `{}`", concept.getId()); - } - - } - - } diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index 58d00e6580..3dbdf83adb 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -3,6 +3,8 @@ import java.io.IOException; import java.util.List; import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -10,11 +12,18 @@ import com.bakdata.conquery.io.storage.NamespaceStorage; import com.bakdata.conquery.mode.local.SqlEntityResolver; import com.bakdata.conquery.mode.local.SqlStorageHandler; +import com.bakdata.conquery.mode.local.SqlUpdateMatchingStatsJob; +import com.bakdata.conquery.models.config.DatabaseConfig; +import com.bakdata.conquery.models.config.SqlConnectorConfig; import com.bakdata.conquery.models.datasets.Column; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; +import com.bakdata.conquery.models.jobs.Job; import com.bakdata.conquery.models.jobs.JobManager; import com.bakdata.conquery.models.query.ExecutionManager; import com.bakdata.conquery.models.query.FilterSearch; import com.bakdata.conquery.sql.DSLContextWrapper; +import 
com.bakdata.conquery.sql.conversion.dialect.SqlDialect; +import com.bakdata.conquery.sql.execution.SqlExecutionService; import com.fasterxml.jackson.databind.ObjectMapper; import lombok.Getter; import lombok.extern.slf4j.Slf4j; @@ -23,12 +32,20 @@ @Slf4j public class LocalNamespace extends Namespace { + private final SqlConnectorConfig sqlConnectorConfig; + private final DatabaseConfig databaseConfig; + private final SqlDialect sqlDialect; + private final SqlExecutionService sqlExecutionService; private final DSLContextWrapper dslContextWrapper; private final SqlStorageHandler storageHandler; public LocalNamespace( ObjectMapper preprocessMapper, NamespaceStorage storage, + SqlConnectorConfig sqlConnectorConfig, + DatabaseConfig databaseConfig, + SqlDialect sqlDialect, + SqlExecutionService sqlExecutionService, ExecutionManager executionManager, DSLContextWrapper dslContextWrapper, SqlStorageHandler storageHandler, @@ -38,13 +55,26 @@ public LocalNamespace( List injectables ) { super(preprocessMapper, storage, executionManager, jobManager, filterSearch, sqlEntityResolver, injectables); + this.sqlConnectorConfig = sqlConnectorConfig; + this.databaseConfig = databaseConfig; + this.sqlDialect = sqlDialect; + this.sqlExecutionService = sqlExecutionService; this.dslContextWrapper = dslContextWrapper; this.storageHandler = storageHandler; } @Override void updateMatchingStats() { - // TODO Build basic statistic on data + final Set concepts = collectConcepts(); + ExecutorService executorService = Executors.newFixedThreadPool(sqlConnectorConfig.getBackgroundThreads()); + Job job = new SqlUpdateMatchingStatsJob( + databaseConfig, + sqlExecutionService, + sqlDialect.getFunctionProvider(), + concepts, + executorService + ); + getJobManager().addSlowJob(job); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java index 0e04fa8e81..7f112044de 100644 --- 
a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java @@ -4,6 +4,7 @@ import java.util.Collection; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import com.bakdata.conquery.apiv1.query.concept.specific.external.EntityResolver; import com.bakdata.conquery.io.jackson.Injectable; @@ -11,8 +12,10 @@ import com.bakdata.conquery.models.datasets.Column; import com.bakdata.conquery.models.datasets.Dataset; import com.bakdata.conquery.models.datasets.PreviewConfig; +import com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.Searchable; import com.bakdata.conquery.models.datasets.concepts.select.connector.specific.MappableSingleColumnSelect; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.jobs.JobManager; import com.bakdata.conquery.models.jobs.SimpleJob; import com.bakdata.conquery.models.jobs.UpdateFilterSearchJob; @@ -54,14 +57,16 @@ public Dataset getDataset() { public void close() { try { jobManager.close(); - } catch (Exception e) { + } + catch (Exception e) { log.error("Unable to close namespace jobmanager of {}", this, e); } try { log.info("Closing namespace storage of {}", getStorage().getDataset().getId()); storage.close(); - } catch (IOException e) { + } + catch (IOException e) { log.error("Unable to close namespace storage of {}.", this, e); } } @@ -69,7 +74,8 @@ public void close() { public void remove() { try { jobManager.close(); - } catch (Exception e) { + } + catch (Exception e) { log.error("Unable to close namespace jobmanager of {}", this, e); } @@ -135,4 +141,11 @@ public void postprocessData() { )); } + + protected Set collectConcepts() { + return getStorage().getAllConcepts() + .filter(concept -> concept.getMatchingStats() == null) + .map(Concept::getId) + .collect(Collectors.toSet()); + } } diff --git 
a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java index 8b1d166155..0201bd8c73 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java @@ -36,6 +36,8 @@ private static final String INFINITY_DATE_VALUE = "infinity"; private static final String MINUS_INFINITY_DATE_VALUE = "-infinity"; private static final String ANY_CHAR_REGEX = "%"; + private static final String RANGE_EXCLUSIVE_END = "[)"; + private static final String RANGE_INCLUSIVE_END = "[]"; @Override public String getMaxDateExpression() { @@ -90,7 +92,7 @@ public ColumnDateRange forCDateRange(CDateRange daterange) { Object.class, DSL.val(startDateExpression), DSL.val(endDateExpression), - DSL.val("[]") + DSL.val(RANGE_INCLUSIVE_END) ); return ColumnDateRange.of(daterangeField); @@ -176,10 +178,14 @@ public Field daterangeStringAggregation(ColumnDateRange columnDateRange) @Override public Field daterangeStringExpression(ColumnDateRange columnDateRange) { + Field daterange; if (!columnDateRange.isSingleColumnRange()) { - throw new UnsupportedOperationException("All column date ranges should have been converted to single column ranges."); + daterange = daterange(columnDateRange.getStart(), columnDateRange.getEnd(), RANGE_EXCLUSIVE_END); } - Field aggregatedValidityDate = DSL.field("({0})::{1}", String.class, columnDateRange.getRange(), DSL.keyword("varchar")); + else { + daterange = columnDateRange.getRange(); + } + Field aggregatedValidityDate = DSL.field("({0})::{1}", String.class, daterange, DSL.keyword("varchar")); return replace(aggregatedValidityDate, INFINITY_DATE_VALUE, INFINITY_SIGN); } @@ -303,7 +309,7 @@ private ColumnDateRange toColumnDateRange(CDateRange dateRestriction) { Object.class, 
toDateField(startDateExpression), toDateField(endDateExpression), - DSL.val("[]") + DSL.val(RANGE_INCLUSIVE_END) ); return ColumnDateRange.of(dateRestrictionRange); @@ -333,14 +339,14 @@ private ColumnDateRange ofSingleColumn(String tableName, Column column) { DSL.function("upper", Date.class, daterange), toDateField(INFINITY_DATE_VALUE) ); - yield daterange(startColumn, endColumn, "[]"); + yield daterange(startColumn, endColumn, RANGE_INCLUSIVE_END); } // if the validity date column is not of daterange type, we construct it manually case DATE -> { Field singleDate = DSL.field(DSL.name(tableName, column.getName()), Date.class); Field startColumn = DSL.coalesce(singleDate, toDateField(MINUS_INFINITY_DATE_VALUE)); Field endColumn = DSL.coalesce(singleDate, toDateField(INFINITY_DATE_VALUE)); - yield daterange(startColumn, endColumn, "[]"); + yield daterange(startColumn, endColumn, RANGE_INCLUSIVE_END); } default -> throw new IllegalArgumentException( "Given column type '%s' can't be converted to a proper date restriction.".formatted(column.getType()) @@ -361,13 +367,13 @@ private ColumnDateRange ofStartAndEnd(String tableName, Column startColumn, Colu toDateField(INFINITY_DATE_VALUE) ); - return ColumnDateRange.of(daterange(start, end, "[]")); + return ColumnDateRange.of(daterange(start, end, RANGE_INCLUSIVE_END)); } private ColumnDateRange ensureIsSingleColumnRange(ColumnDateRange daterange) { return daterange.isSingleColumnRange() ? 
daterange - : ColumnDateRange.of(daterange(daterange.getStart(), daterange.getEnd(), "[)")); // end is already exclusive + : ColumnDateRange.of(daterange(daterange.getStart(), daterange.getEnd(), RANGE_EXCLUSIVE_END)); // end is already exclusive } } diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java index 29b7fca7dc..a354a92f10 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java @@ -12,6 +12,11 @@ public class DefaultSqlCDateSetParser implements SqlCDateSetParser { + /** + * Postgres daterange function creates this expression when called with null-arguments instead of null. + */ + public static final String POSTGRES_NULL_RANGE = "(,)"; + public static final String EMPTY_RANGE_BRACES = "{}"; public static final String DATE_SEPARATOR = ","; public static final char INCLUDED_START_CHAR = '['; @@ -37,7 +42,7 @@ public List> toEpochDayRangeList(String multiDateRange) { @Override public List toEpochDayRange(String daterange) { - if (daterange == null) { + if (daterange == null || daterange.equals(POSTGRES_NULL_RANGE)) { return Collections.emptyList(); } diff --git a/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java b/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java index bf3c1a5325..ae02f110ec 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java @@ -15,7 +15,7 @@ public class TestSqlConnectorConfig extends SqlConnectorConfig { private static final String TEST_DATASET = "test"; public TestSqlConnectorConfig(DatabaseConfig databaseConfig) { - super(true, 
true, Map.of(TEST_DATASET, databaseConfig), null); + super(true, true, Runtime.getRuntime().availableProcessors(), Map.of(TEST_DATASET, databaseConfig), null); } @Override diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java index c6573cb181..e76ec7ad19 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java @@ -3,17 +3,16 @@ import static org.assertj.core.api.Assertions.assertThat; import java.time.LocalDate; +import java.util.Set; import com.bakdata.conquery.integration.IntegrationTest; import com.bakdata.conquery.integration.json.ConqueryTestSpec; import com.bakdata.conquery.integration.json.JsonIntegrationTest; import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.concepts.Concept; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.exceptions.ValidatorHelper; import com.bakdata.conquery.models.identifiable.ids.specific.DatasetId; -import com.bakdata.conquery.models.messages.namespaces.specific.UpdateMatchingStatsMessage; -import com.bakdata.conquery.models.worker.DistributedNamespace; import com.bakdata.conquery.util.support.StandaloneSupport; import com.github.powerlibraries.io.In; import lombok.extern.slf4j.Slf4j; @@ -21,39 +20,51 @@ @Slf4j public class MetadataCollectionTest extends IntegrationTest.Simple implements ProgrammaticIntegrationTest { + @Override + public Set forModes() { + return Set.of(StandaloneSupport.Mode.WORKER, StandaloneSupport.Mode.SQL); + } + @Override public void execute(StandaloneSupport conquery) throws Exception { - //read test sepcification - String testJson = 
In.resource("/tests/query/SIMPLE_TREECONCEPT_QUERY/SIMPLE_TREECONCEPT_Query.test.json").withUTF8().readAll(); + //read test specification + String testJson = In.resource("/shared/icd.test.json").withUTF8().readAll(); DatasetId dataset = conquery.getDataset().getId(); - ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson); ValidatorHelper.failOnError(log, conquery.getValidator().validate(test)); - test.importRequiredData(conquery); - //ensure the metadata is collected - DistributedNamespace namespace = (DistributedNamespace) conquery.getNamespace(); - namespace.getWorkerHandler() - .sendToAll(new UpdateMatchingStatsMessage(conquery.getNamespace().getStorage().getAllConcepts().map(Concept::getId).toList())); - + // triggers update matching stats + conquery.getNamespace().postprocessData(); conquery.waitUntilWorkDone(); + TreeConcept concept = (TreeConcept) conquery.getNamespace().getStorage().getAllConcepts().toList().iterator().next(); - TreeConcept concept = (TreeConcept) conquery.getNamespace().getStorage().getAllConcepts().iterator().next(); - - //check the number of matched events - assertThat(concept.getMatchingStats().countEvents()).isEqualTo(4); - assertThat(concept.getChildren()).allSatisfy(c -> { - assertThat(c.getMatchingStats().countEvents()).isEqualTo(2); + //check the number of matched events from root node to the deepest child node + assertThat(concept.getMatchingStats().countEvents()).isEqualTo(10); + assertThat(concept.getMatchingStats().countEntities()).isEqualTo(3); + // concepts 1. child (F00-F99) + ConceptTreeChild f00_99 = concept.getChildren().get(0); + assertThat(f00_99.getMatchingStats().countEvents()).isEqualTo(8); + assertThat(f00_99.getMatchingStats().countEntities()).isEqualTo(3); + // 1. child's child (F20-29) + ConceptTreeChild f20_29 = f00_99.getChildren().get(0); + assertThat(f20_29.getMatchingStats().countEvents()).isEqualTo(7); + assertThat(f20_29.getMatchingStats().countEntities()).isEqualTo(2); + // 1. 
child's child's child (yeah it's getting wild) + ConceptTreeChild f20 = f20_29.getChildren().get(0); + assertThat(f20.getMatchingStats().countEvents()).isEqualTo(5); + assertThat(f20.getMatchingStats().countEntities()).isEqualTo(1); + // 1. child's child's child's children (I promise it won't get worse) + assertThat(f20.getChildren()).allSatisfy(child -> { + assertThat(child.getMatchingStats().countEvents()).isEqualTo(1); + assertThat(child.getMatchingStats().countEntities()).isEqualTo(1); }); - + //check the date ranges assertThat(concept.getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2013-11-10"))); - assertThat(concept.getChildren().get(0).getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2012-01-01"), LocalDate.parse("2013-11-10"))); - assertThat(concept.getChildren().get(1).getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2012-11-11"))); + .isEqualTo(CDateRange.of(LocalDate.parse("2009-05-18"), LocalDate.parse("2023-08-20"))); + assertThat(f20.getMatchingStats().spanEvents()) + .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-01"), LocalDate.parse("2023-02-18"))); } } diff --git a/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java b/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java index 8938a83e1c..68e172e5ae 100644 --- a/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java +++ b/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java @@ -2,89 +2,87 @@ import static org.assertj.core.api.Assertions.assertThat; +import com.bakdata.conquery.mode.cluster.WorkerMatchingStats; import com.bakdata.conquery.models.datasets.Column; import com.bakdata.conquery.models.datasets.Table; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; 
import com.bakdata.conquery.models.identifiable.ids.specific.DatasetId; import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; import org.junit.jupiter.api.Test; public class MatchingStatsTests { - private final WorkerId workerId1 = new WorkerId(new DatasetId("sampleDataset"), "sampleWorker"); - private final WorkerId workerId2 = new WorkerId(new DatasetId("sampleDataset2"), "sampleWorker2"); + private final WorkerId workerId1 = new WorkerId(new DatasetId("sampleDataset"), "sampleWorker"); + private final WorkerId workerId2 = new WorkerId(new DatasetId("sampleDataset2"), "sampleWorker2"); - @Test - public void entitiesCountTest() { + @Test + public void entitiesCountTest() { - MatchingStats stats = new MatchingStats(); + WorkerMatchingStats stats = new WorkerMatchingStats(); - assertThat(stats.countEntities()).isEqualTo(0); + assertThat(stats.countEntities()).isEqualTo(0); - stats.putEntry(workerId1, new MatchingStats.Entry(5, 5, 10, 20)); - assertThat(stats.countEntities()).isEqualTo(5); + stats.putEntry(workerId1, new WorkerMatchingStats.Entry(5, 5, 10, 20)); + assertThat(stats.countEntities()).isEqualTo(5); - stats.putEntry(workerId1, new MatchingStats.Entry(5, 8, 10, 20)); - assertThat(stats.countEntities()).isEqualTo(8); + stats.putEntry(workerId1, new WorkerMatchingStats.Entry(5, 8, 10, 20)); + assertThat(stats.countEntities()).isEqualTo(8); - stats.putEntry(workerId2, new MatchingStats.Entry(5, 2, 10, 20)); - assertThat(stats.countEntities()).isEqualTo(10); + stats.putEntry(workerId2, new WorkerMatchingStats.Entry(5, 2, 10, 20)); + assertThat(stats.countEntities()).isEqualTo(10); - } + } - @Test - public void addEventTest(){ - MatchingStats stats = new MatchingStats(); - Table table = new Table(); - table.setColumns(new Column[0]); + @Test + public void addEventTest() { + WorkerMatchingStats stats = new WorkerMatchingStats(); + Table table = new Table(); + table.setColumns(new Column[0]); - assertThat(stats.countEvents()).isEqualTo(0); - 
assertThat(stats.countEntities()).isEqualTo(0); + assertThat(stats.countEvents()).isEqualTo(0); + assertThat(stats.countEntities()).isEqualTo(0); - MatchingStats.Entry entry1 = new MatchingStats.Entry(); - entry1.addEvent(table, null, 1, "1"); - entry1.addEvent(table, null, 2, "1"); + WorkerMatchingStats.Entry entry1 = new WorkerMatchingStats.Entry(); + entry1.addEvent(table, null, 1, "1"); + entry1.addEvent(table, null, 2, "1"); - entry1.addEvent(table, null, 3, "2"); - entry1.addEvent(table, null, 4, "2"); + entry1.addEvent(table, null, 3, "2"); + entry1.addEvent(table, null, 4, "2"); - entry1.addEvent(table, null, 5, "3"); - entry1.addEvent(table, null, 6, "3"); + entry1.addEvent(table, null, 5, "3"); + entry1.addEvent(table, null, 6, "3"); - entry1.addEvent(table, null, 7, "4"); - entry1.addEvent(table, null, 8, "4"); + entry1.addEvent(table, null, 7, "4"); + entry1.addEvent(table, null, 8, "4"); + stats.putEntry(workerId1, entry1); + assertThat(stats.countEvents()).isEqualTo(8); + assertThat(stats.countEntities()).isEqualTo(4); - stats.putEntry(workerId1, entry1); - assertThat(stats.countEvents()).isEqualTo(8); - assertThat(stats.countEntities()).isEqualTo(4); + WorkerMatchingStats.Entry entry2 = new WorkerMatchingStats.Entry(); - MatchingStats.Entry entry2 = new MatchingStats.Entry(); + entry2.addEvent(table, null, 1, "1"); + entry2.addEvent(table, null, 2, "2"); - entry2.addEvent(table, null, 1, "1"); - entry2.addEvent(table, null, 2, "2"); + entry2.addEvent(table, null, 3, "3"); + entry2.addEvent(table, null, 4, "4"); - entry2.addEvent(table, null, 3, "3"); - entry2.addEvent(table, null, 4, "4"); + entry2.addEvent(table, null, 5, "5"); + entry2.addEvent(table, null, 6, "6"); - entry2.addEvent(table, null, 5, "5"); - entry2.addEvent(table, null, 6, "6"); + entry2.addEvent(table, null, 7, "7"); + entry2.addEvent(table, null, 8, "8"); - entry2.addEvent(table, null, 7, "7"); - entry2.addEvent(table, null, 8, "8"); + entry2.addEvent(table, null, 9, "9"); + 
entry2.addEvent(table, null, 10, "10"); - entry2.addEvent(table, null, 9, "9"); - entry2.addEvent(table, null, 10, "10"); + stats.putEntry(workerId2, entry2); + assertThat(stats.countEvents()).isEqualTo(18); + assertThat(stats.countEntities()).isEqualTo(14); - stats.putEntry(workerId2, entry2); - assertThat(stats.countEvents()).isEqualTo(18); - assertThat(stats.countEntities()).isEqualTo(14); - - - } + } } diff --git a/backend/src/test/resources/shared/icd.test.json b/backend/src/test/resources/shared/icd.test.json new file mode 100644 index 0000000000..e6b97db334 --- /dev/null +++ b/backend/src/test/resources/shared/icd.test.json @@ -0,0 +1,210 @@ +{ + "type": "QUERY_TEST", + "label": "COMMON_CONCEPT_ICD_QUERY Test", + "expectedCsv": "", + "query": { + "type": "CONCEPT_QUERY", + "root": { + "type": "AND", + "children": [ + { + "type": "DATE_RESTRICTION", + "dateRange": { + "min": "2017-01-01", + "max": "2017-12-31" + }, + "child": { + "type": "CONCEPT", + "ids": [ + "icd.f00$2df99.f20$2df29.f20" + ], + "label": "F20", + "tables": [ + { + "id": "icd.kh_diagnose_icd_code", + "filters": [] + } + ] + } + } + ] + } + }, + "concepts": [ + { + "label": "ICD", + "type": "TREE", + "additionalInfos": [ + { + "key": "ICD-Codes", + "value": "Historisierung bis einschließlich des Jahres 2018" + } + ], + "connectors": [ + { + "label": "KH-Diagnose", + "name": "kh_diagnose_icd_code", + "column": "kh_diagnose.icd_code", + "condition": { + "type": "PREFIX_RANGE", + "min": "E", + "max": "F" + }, + "validityDates": [ + { + "label": "Aufnahmedatum", + "column": "kh_diagnose.aufnahmedatum" + }, + { + "label": "Entlassungsdatum", + "column": "kh_diagnose.entlassungsdatum" + } + ], + "filters": [] + } + ], + "children": [ + { + "label": "F00-F99", + "description": "Psychische und Verhaltensstörungen", + "condition": { + "type": "PREFIX_RANGE", + "min": "F00", + "max": "F99" + }, + "children": [ + { + "label": "F20-F29", + "description": "Schizophrenie, schizotype und wahnhafte 
Störungen", + "condition": { + "type": "PREFIX_RANGE", + "min": "F20", + "max": "F29" + }, + "children": [ + { + "label": "F20", + "description": "Schizophrenie", + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F20" + ] + }, + "children": [ + { + "label": "F20.0", + "description": "Paranoide Schizophrenie", + "additionalInfos": [ + { + "key": "Stichworte", + "value": "Paranoide Schizophrenie -- Paranoid-halluzinatorische Schizophrenie -- Paranoide Schizophrenie mit Halluzination -- Paraphrenie -- Paranoid-schizophrene Psychose -- Akute Paraphrenie -- Paraphrene Schizophrenie -- Akute paranoide Schizophrenie" + } + ], + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F200" + ] + } + }, + { + "label": "F20.1", + "description": "Hebephrene Schizophrenie", + "additionalInfos": [ + { + "key": "Stichworte", + "value": "Hebephrenie -- Hebephrene Schizophrenie -- Akute Hebephrenie -- Hebephrene Demenz -- Hebephrene Dementia praecox -- Desintegrative Schizophrenie -- Desorganisierte Schizophrenie -- Jugendirresein" + } + ], + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F201" + ] + } + }, + { + "label": "F20.4", + "description": "Postschizophrene Depression", + "additionalInfos": [ + { + "key": "Stichworte", + "value": "Postschizophrene Depression" + } + ], + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F204" + ] + } + }, + { + "label": "F20.5", + "description": "Schizophrenes Residuum", + "additionalInfos": [ + { + "key": "Stichworte", + "value": "Schizophrenes Residuum -- Schizophrener Restzustand -- Chronischer Morbus Bleuler -- Schizophrener Defekt -- Chronische Schizophrenie a.n.k. 
-- Residuale Schizophrenie -- Schizophrener Residualzustand -- Chronische undifferenzierte Schizophrenie" + } + ], + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F205" + ] + } + }, + { + "label": "F20.6", + "description": "Schizophrenia simplex", + "additionalInfos": [ + { + "key": "Stichworte", + "value": "Schizophrenia simplex -- Akute primäre Schizophrenie -- Akute einfache Schizophrenie" + } + ], + "condition": { + "type": "PREFIX_LIST", + "prefixes": [ + "F206" + ] + } + } + ] + } + ] + } + ] + } + ] + } + ], + "content": { + "tables": [ + { + "csv": "/shared/kh-content.csv", + "name": "kh_diagnose", + "primaryColumn": { + "name": "primary_column", + "type": "STRING" + }, + "columns": [ + { + "name": "icd_code", + "type": "STRING" + }, + { + "name": "aufnahmedatum", + "type": "DATE" + }, + { + "name": "entlassungsdatum", + "type": "DATE" + } + ] + } + ] + } +} diff --git a/backend/src/test/resources/shared/kh-content.csv b/backend/src/test/resources/shared/kh-content.csv new file mode 100644 index 0000000000..679471fb56 --- /dev/null +++ b/backend/src/test/resources/shared/kh-content.csv @@ -0,0 +1,11 @@ +primary_column,icd_code,aufnahmedatum,entlassungsdatum +3,"F200",2022-11-28,2022-11-11 +3,"F201",2021-08-31,2021-12-15 +3,"F204",2010-07-01,2019-07-13 +3,"F205",2023-02-06,2023-02-18 +3,"F206",2021-10-22,2021-11-06 +10,"F21",2014-04-18,2022-06-29 +10,"F22",2016-12-15,2018-11-28 +15,"F3",2017-12-08,2019-09-23 +15,"F31",2022-03-22,2023-08-20 +15,"E66",2009-05-18,2021-11-06 From f5955d5d0b2a87f66f91f34d9b87456579c11714 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 16 Oct 2024 21:59:42 +0200 Subject: [PATCH 02/31] Submit matchings stats collection tasks in parallel --- .../bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java 
b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 8833b6a1ea..2dca7c56bb 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -93,6 +93,7 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); concepts.stream() + .parallel() .map(ConceptId::resolve) .filter(SqlUpdateMatchingStatsJob::isTreeConcept) .flatMap(concept -> collectMatchingStats(concept.getConnectors(), (TreeConcept) concept)) From 467abcc56bd7bbdc11dbe41a7462b21aff139be3 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 17 Oct 2024 15:39:00 +0200 Subject: [PATCH 03/31] Track time while calculating matching stats --- .../conquery/mode/local/SqlUpdateMatchingStatsJob.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 2dca7c56bb..4ae932d1cd 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -92,6 +92,7 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); + long startTime = System.currentTimeMillis(); concepts.stream() .parallel() .map(ConceptId::resolve) @@ -105,7 +106,8 @@ public void execute() throws Exception { log.debug("Waiting for executors to set matching stats for all concepts..."); } - log.debug("DONE collecting matching stats."); + long timeElapsed = System.currentTimeMillis() - startTime; + log.debug("DONE collecting matching stats. 
Elapsed time: {} ms", timeElapsed); } @Override From 63a5832bc2a922ed1232509ab282b59f53226aa9 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 17 Oct 2024 18:17:56 +0200 Subject: [PATCH 04/31] Deactivate mappings and filter job --- .../java/com/bakdata/conquery/models/worker/Namespace.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java index 7f112044de..327e06f5d6 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java @@ -134,9 +134,9 @@ public void postprocessData() { getJobManager().addSlowJob(new SimpleJob( "Initiate Update Matching Stats and FilterSearch", () -> { - updateInternToExternMappings(); + // updateInternToExternMappings(); updateMatchingStats(); - updateFilterSearch(); + // updateFilterSearch(); } )); From 2e130baa2b51d3fb58703f04cf69d91b9ea64426 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Fri, 25 Oct 2024 08:51:10 +0200 Subject: [PATCH 05/31] Draft for fast matching stats calculation of only-equal-condition trees --- .../conquery/mode/local/SqlMatchingStats.java | 63 ++- .../mode/local/SqlUpdateMatchingStatsJob.java | 497 ++++++++++++------ .../dialect/HanaSqlFunctionProvider.java | 8 +- .../dialect/PostgreSqlFunctionProvider.java | 6 + .../dialect/SqlFunctionProvider.java | 5 + .../execution/DefaultResultSetProcessor.java | 7 +- .../execution/DefaultSqlCDateSetParser.java | 2 +- .../sql/execution/ResultSetProcessor.java | 2 + 8 files changed, 418 insertions(+), 172 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java index a47199712b..e370dc7963 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java +++ 
b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java @@ -1,15 +1,21 @@ package com.bakdata.conquery.mode.local; +import java.time.LocalDate; + import com.bakdata.conquery.models.common.daterange.CDateRange; import com.bakdata.conquery.models.datasets.concepts.MatchingStats; -import lombok.Value; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; -@Value +@Data +@NoArgsConstructor +@AllArgsConstructor public class SqlMatchingStats implements MatchingStats { - long numberOfEvents; - long numberOfEntities; - CDateRange span; + private long numberOfEvents; + private long numberOfEntities; + private CDateRange span; @Override public long countEvents() { @@ -26,4 +32,51 @@ public CDateRange spanEvents() { return span; } + public SqlMatchingStats add(SqlMatchingStats other) { + + this.numberOfEvents += other.numberOfEvents; + this.numberOfEntities += other.numberOfEntities; + + if ((this.span == null && other.span == null) || this.span != null && other.span == null) { + return this; + } + else if (this.span == null) { + this.span = other.span; + return this; + } + else { + + final LocalDate thisMin = this.span.getMin(); + final LocalDate otherMin = other.getSpan().getMin(); + final LocalDate min; + + if (thisMin == null) { + min = otherMin; + } + else if (otherMin == null) { + min = thisMin; + } + else { + min = thisMin.isBefore(otherMin) ? thisMin : otherMin; + } + + final LocalDate thisMax = this.span.getMax(); + final LocalDate otherMax = other.getSpan().getMax(); + final LocalDate max; + + if (thisMax == null) { + max = otherMax; + } + else if (otherMax == null) { + max = thisMax; + } + else { + max = thisMax.isAfter(otherMax) ? 
thisMax : otherMax; + } + + this.span = CDateRange.of(min, max); + return this; + } + } + } diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 4ae932d1cd..87127ebd3e 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -3,7 +3,8 @@ import static org.jooq.impl.DSL.*; import java.math.BigDecimal; -import java.sql.ResultSet; +import java.sql.Date; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; @@ -15,6 +16,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BinaryOperator; import java.util.function.Function; import java.util.stream.Collectors; @@ -26,7 +28,10 @@ import com.bakdata.conquery.models.datasets.concepts.ConceptElement; import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.conditions.CTCondition; +import com.bakdata.conquery.models.datasets.concepts.conditions.EqualCondition; +import com.bakdata.conquery.models.datasets.concepts.conditions.PrefixCondition; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeConnector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; @@ -42,6 +47,7 @@ import org.jooq.Condition; import org.jooq.DSLContext; import org.jooq.Field; +import org.jooq.Name; import org.jooq.Record; import org.jooq.Record1; import org.jooq.Result; @@ -53,11 
+59,12 @@ @Slf4j public class SqlUpdateMatchingStatsJob extends Job { - private static final String EVENTS_FIELD = "events"; + private static final Name CONNECTOR_COLUMN = name("connector_column"); + private static final Name EVENTS = name("events"); private static final String EVENTS_TABLE = "events_unioned"; - private static final String PRIMARY_COLUMN_ALIAS = SharedAliases.PRIMARY_COLUMN.getAlias(); + private static final Name ENTITIES = name("entities"); private static final String ENTITIES_TABLE = "entities"; - private static final String VALIDITY_DATE_SELECT = "unioned"; + private static final Name DATES = name("unioned"); private static final String VALIDITY_DATES_TABLE = "validity_dates"; private final DatabaseConfig databaseConfig; @@ -92,22 +99,51 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); - long startTime = System.currentTimeMillis(); + final List regularApproach = new ArrayList<>(); + final List onlyEqualConditions = new ArrayList<>(); + final List prefixConcept = new ArrayList<>(); + concepts.stream() .parallel() .map(ConceptId::resolve) .filter(SqlUpdateMatchingStatsJob::isTreeConcept) - .flatMap(concept -> collectMatchingStats(concept.getConnectors(), (TreeConcept) concept)) - .map(executors::submit) - .forEach(SqlUpdateMatchingStatsJob::checkForError); + .forEach(concept -> { + final TreeConcept treeConcept = (TreeConcept) concept; + if (treeConcept.getChildren().isEmpty()) { + regularApproach.add(treeConcept); + } + else if (anyConditionMatches(treeConcept.getChildren(), PrefixCondition.class)) { + prefixConcept.add(treeConcept); + } + else if (allConditionsMatch(treeConcept.getChildren(), EqualCondition.class)) { + onlyEqualConditions.add(treeConcept); + } + else { + regularApproach.add(treeConcept); + } + }); + + log.info("Skipping matching stats calc for prefix concepts: {}", prefixConcept.stream().map(Concept::getName).toList()); + + final long startTime = 
System.currentTimeMillis(); + final List> runningQueries = + Stream.concat( + regularApproach.stream().flatMap(concept -> walkAndCollectMatchingStats(concept.getConnectors(), concept)), + onlyEqualConditions.stream().map(AllEqualConditionsTask::new) + ) + .parallel() + .map(executors::submit) + .toList(); executors.shutdown(); while (!executors.awaitTermination(1, TimeUnit.MINUTES)) { log.debug("Waiting for executors to set matching stats for all concepts..."); } - long timeElapsed = System.currentTimeMillis() - startTime; - log.debug("DONE collecting matching stats. Elapsed time: {} ms", timeElapsed); + final long timeElapsed = System.currentTimeMillis() - startTime; + log.info("DONE collecting matching stats. Elapsed time: {} ms", timeElapsed); + + runningQueries.forEach(SqlUpdateMatchingStatsJob::checkForError); } @Override @@ -116,16 +152,7 @@ public void cancel() { executors.shutdownNow(); } - private static void checkForError(Future future) { - try { - future.get(); - } - catch (ExecutionException | InterruptedException e) { - log.error("Unknown error while querying SQL matching stats. 
Cause: \n", e.getCause()); - } - } - - private static boolean isTreeConcept(Concept concept) { + private static boolean isTreeConcept(final Concept concept) { if (!(concept instanceof TreeConcept)) { log.error("Collecting MatchingStats is currently only supported for TreeConcepts."); return false; @@ -133,187 +160,333 @@ private static boolean isTreeConcept(Concept concept) { return true; } - private Stream> collectMatchingStats(List connectors, ConceptTreeNode treeNode) { - return Stream.concat( - treeNode.getChildren().stream().flatMap(child -> collectMatchingStats(connectors, child)), - Stream.of(new SqlMatchingStatsTask(connectors, (ConceptElement) treeNode)) - ); + private boolean allConditionsMatch(final List children, final Class condition) { + return children.stream().allMatch(child -> { + if (child.getChildren().isEmpty()) { + return condition.isInstance(child.getCondition()); + } + if (child.getCondition() != null && !condition.isInstance(child.getCondition())) { + return false; + } + return allConditionsMatch(child.getChildren(), condition); + }); } - /** - * Applies a count(*) on each connector's table, unions these tables and finally calculates the sum() of the count per connector - * to obtain the concept's total event count. 
- */ - private long collectEventCount(List connectors, Optional childCondition) { - - org.jooq.Table> eventsUnioned = - union(connectors, connector -> createCountEventsQuery(connector, childCondition), Select::unionAll, EVENTS_TABLE); - - SelectJoinStep> eventsQuery = dslContext.select(sum(eventsUnioned.field(EVENTS_FIELD, BigDecimal.class)).as(EVENTS_FIELD)) - .from(eventsUnioned); + private boolean anyConditionMatches(final List children, final Class condition) { + return children.stream().anyMatch(child -> { + if (child.getChildren().isEmpty()) { + return condition.isInstance(child.getCondition()); + } + if (child.getCondition() != null && condition.isInstance(child.getCondition())) { + return true; + } + return anyConditionMatches(child.getChildren(), condition); + }); + } - Result result = executionService.fetch(eventsQuery); + private static void checkForError(final Future future) { try { - BigDecimal events = (BigDecimal) result.getValue(0, EVENTS_FIELD); - return Objects.requireNonNull(events).longValue(); + future.get(); } - catch (Exception e) { - log.error("Expecting exactly 1 column of numeric type and 1 row in Result when querying for events of a concept node. Error: ", e); - return 0; + catch (ExecutionException | InterruptedException e) { + log.error("Unknown error while querying SQL matching stats. 
Cause: \n", e.getCause()); } } - private SelectConditionStep> createCountEventsQuery(Connector connector, Optional childCondition) { - return dslContext.select(count().as(EVENTS_FIELD)) - .from(table(name(connector.getResolvedTable().getName()))) - .where(toJooqCondition(connector, childCondition)); + private Stream walkAndCollectMatchingStats(final List connectors, final ConceptTreeNode treeNode) { + return Stream.concat( + treeNode.getChildren().stream().flatMap(child -> walkAndCollectMatchingStats(connectors, child)), + Stream.of(new RegularTask(connectors, (ConceptElement) treeNode)) + ); } - /** - * Selects the PIDs for each connector, unions these tables and does a countDistinct(pid) to obtain the concepts total entity count. - */ - private long collectEntityCount(List connectors, Optional childCondition) { + private CDateRange toDateRange(final String validityDateExpression) { + final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); + return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : null; + } - org.jooq.Table> entitiesUnioned = - union(connectors, connector -> createCountEntitiesQuery(connector, childCondition), Select::union, ENTITIES_TABLE); + @RequiredArgsConstructor + private class RegularTask implements Callable { - SelectJoinStep> entitiesQuery = - dslContext.select(countDistinct(entitiesUnioned.field(PRIMARY_COLUMN_ALIAS)).as(PRIMARY_COLUMN_ALIAS)) - .from(entitiesUnioned); + private final List connectors; + private final ConceptElement treeNode; - Result result = executionService.fetch(entitiesQuery); - try { - // we will get an Integer as SQL return type of SUM select, but MatchingStats expect a long - Integer value = (Integer) result.getValue(0, PRIMARY_COLUMN_ALIAS); - return Objects.requireNonNull(value).longValue(); + @Override + public Void call() { + final Optional childCondition = treeNode instanceof ConceptTreeChild treeChild + ? 
Optional.of(treeChild.getCondition()) + : Optional.empty(); + + final long events = collectEventCount(connectors, childCondition); + final long entities = collectEntityCount(connectors, childCondition); + final CDateRange span = collectDateSpan(connectors, childCondition); + + final SqlMatchingStats matchingStats = new SqlMatchingStats(events, entities, span); + treeNode.setMatchingStats(matchingStats); + + return null; } - catch (Exception e) { - log.error("Expecting exactly 1 column of type Integer and 1 row in Result when querying for events of a concept node. Error: ", e); - return 0; + + /** + * Applies a count(*) on each connector's table, unions these tables and finally calculates the sum() of the count per connector + * to obtain the concept's total event count. + */ + private long collectEventCount(final List connectors, final Optional childCondition) { + + final org.jooq.Table> eventsUnioned = + union(connectors, connector -> createCountEventsQuery(connector, childCondition), Select::unionAll, EVENTS_TABLE); + + final SelectJoinStep> eventsQuery = dslContext.select(sum(eventsUnioned.field(EVENTS, BigDecimal.class)).as(EVENTS)) + .from(eventsUnioned); + + final Result result = executionService.fetch(eventsQuery); + try { + final BigDecimal events = (BigDecimal) result.getValue(0, field(EVENTS)); + return Objects.requireNonNull(events).longValue(); + } + catch (Exception e) { + log.error("Expecting exactly 1 column of numeric type and 1 row in Result when querying for events of a concept node. 
Error: ", e); + return 0; + } } - } - private SelectConditionStep> createCountEntitiesQuery(Connector connector, Optional childCondition) { - Field primaryColumn = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(PRIMARY_COLUMN_ALIAS); - Table connectorTable = table(name(connector.getResolvedTable().getName())); - Condition connectorCondition = toJooqCondition(connector, childCondition); - return dslContext.select(primaryColumn) - .from(connectorTable) - .where(connectorCondition); - } + private SelectConditionStep> createCountEventsQuery(final Connector connector, final Optional childCondition) { + return dslContext.select(count().as(EVENTS)) + .from(table(name(connector.getResolvedTable().getName()))) + .where(toJooqCondition(connector, childCondition)); + } - /** - * For each connector and each of its validity dates, we select the start and end date, union all these tables and select the min(start) and max(end) - * to obtain the concepts total date span. - * - * @return A {@link CDateRange} with the min and max validity date over all the given connectors. Null, if the given connectors have no validity date at all. - */ - private CDateRange collectDateSpan(List connectors, Optional childCondition) { - - Map> validityDateMap = connectors.stream().collect( - // we create all validity dates with the same alias to union them later - Collectors.toMap(Function.identity(), connector -> createColumnDateRanges(connector, VALIDITY_DATE_SELECT)) - ); - if (validityDateMap.values().stream().allMatch(List::isEmpty)) { - return null; + /** + * Selects the PIDs for each connector, unions these tables and does a countDistinct(pid) to obtain the concepts total entity count. 
+ */ + private long collectEntityCount(final List connectors, final Optional childCondition) { + + final org.jooq.Table> entitiesUnioned = + union(connectors, connector -> createCountEntitiesQuery(connector, childCondition), Select::union, ENTITIES_TABLE); + + final SelectJoinStep> entitiesQuery = + dslContext.select(countDistinct(entitiesUnioned.field(ENTITIES)).as(ENTITIES)) + .from(entitiesUnioned); + + final Result result = executionService.fetch(entitiesQuery); + try { + // we will get an Integer as SQL return type of SUM select, but MatchingStats expect a long + final Integer value = (Integer) result.getValue(0, field(ENTITIES)); + return Objects.requireNonNull(value).longValue(); + } + catch (Exception e) { + log.error("Expecting exactly 1 column of type Integer and 1 row in Result when querying for events of a concept node. Error: ", e); + return 0; + } } - org.jooq.Table validityDatesUnioned = unionAllValidityDates(validityDateMap, childCondition); - // we just need any of the generated column date ranges to get the name of the unioned field(s) - ColumnDateRange anyOfTheUnionedDates = validityDateMap.get(connectors.get(0)).get(0); - // ensure we have a start and end field (and not a single-column range), because we need to get the min(start) and max(end) - ColumnDateRange dualColumn = functionProvider.toDualColumn(anyOfTheUnionedDates); - // the get the overall min and max - ColumnDateRange minAndMax = ColumnDateRange.of(min(dualColumn.getStart()), max(dualColumn.getEnd())); - // finally, we create the proper string expression which handles possible +/-infinity date values - Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(VALIDITY_DATE_SELECT); - SelectJoinStep> dateSpanQuery = dslContext.select(validityDateExpression) - .from(validityDatesUnioned); - - Result result = executionService.fetch(dateSpanQuery); - try (ResultSet resultSet = result.intoResultSet()) { - - // If no values were encountered this the result is 
empty: Table might be empty, or condition does not match any node. - if (!resultSet.isBeforeFirst()) { + private SelectConditionStep> createCountEntitiesQuery(final Connector connector, final Optional childCondition) { + final Field primaryColumn = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); + final Table connectorTable = table(name(connector.getResolvedTable().getName())); + final Condition connectorCondition = toJooqCondition(connector, childCondition); + return dslContext.select(primaryColumn) + .from(connectorTable) + .where(connectorCondition); + } + + /** + * For each connector and each of its validity dates, we select the start and end date, union all these tables and select the min(start) and max(end) + * to obtain the concepts total date span. + * + * @return A {@link CDateRange} with the min and max validity date over all the given connectors. Null, if the given connectors have no validity date at all. + */ + private CDateRange collectDateSpan(final List connectors, final Optional childCondition) { + + final Map> validityDateMap = connectors.stream().collect( + // we create all validity dates with the same alias to union them later + Collectors.toMap(Function.identity(), this::createColumnDateRanges) + ); + if (validityDateMap.values().stream().allMatch(List::isEmpty)) { return null; } - resultSet.next(); // we advance to first line of the ResultSet - List dateRange = executionService.getResultSetProcessor().getDateRange(resultSet, 1); - return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : null; - } - catch (Exception e) { - log.error("Expecting exactly 1 column containing a daterange expression when querying for the date span of a concept. 
Error: ", e); - return null; + final org.jooq.Table validityDatesUnioned = unionAllValidityDates(validityDateMap, childCondition); + // we just need any of the generated column date ranges to get the name of the unioned field(s) + final ColumnDateRange anyOfTheUnionedDates = validityDateMap.get(connectors.get(0)).get(0); + // ensure we have a start and end field (and not a single-column range), because we need to get the min(start) and max(end) + final ColumnDateRange dualColumn = functionProvider.toDualColumn(anyOfTheUnionedDates); + // the get the overall min and max + final ColumnDateRange minAndMax = ColumnDateRange.of(min(dualColumn.getStart()), max(dualColumn.getEnd())); + // finally, we create the proper string expression which handles possible +/-infinity date values + final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); + final SelectJoinStep> dateSpanQuery = dslContext.select(validityDateExpression) + .from(validityDatesUnioned); + + final Result result = executionService.fetch(dateSpanQuery); + try { + final String dateExpression = (String) result.getValue(0, field(DATES)); + return toDateRange(dateExpression); + } + catch (Exception e) { + log.error("Expecting exactly 1 column containing a daterange expression when querying for the date span of a concept. 
Error: ", e); + return null; + } } - } - private List createColumnDateRanges(Connector connector, String alias) { - return connector.getValidityDates().stream() - .map(functionProvider::forValidityDate) - .map(daterange -> daterange.as(alias)) - .toList(); - } + private List createColumnDateRanges(final Connector connector) { + return connector.getValidityDates().stream() + .map(functionProvider::forValidityDate) + .map(daterange -> daterange.as(DATES.last())) + .toList(); + } - private org.jooq.Table unionAllValidityDates(Map> validityDateMap, Optional childCondition) { - return validityDateMap.entrySet().stream() - .flatMap(entry -> { - Connector connector = entry.getKey(); - List validityDates = entry.getValue(); - return validityDates.stream().map(columnDateRange -> createValidityDateQuery(columnDateRange, connector, childCondition)); - }) - .reduce((validityDate1, validityDate2) -> (SelectConditionStep) validityDate1.unionAll(validityDate2)) - .orElseThrow(() -> new RuntimeException("Expected at least 1 validity date to be present.")) - .asTable(name(VALIDITY_DATES_TABLE)); - } + private org.jooq.Table unionAllValidityDates(final Map> validityDateMap, final Optional childCondition) { + return validityDateMap.entrySet().stream() + .flatMap(entry -> { + Connector connector = entry.getKey(); + List validityDates = entry.getValue(); + return validityDates.stream().map(columnDateRange -> createValidityDateQuery(columnDateRange, connector, childCondition)); + }) + .reduce((validityDate1, validityDate2) -> (SelectConditionStep) validityDate1.unionAll(validityDate2)) + .orElseThrow(() -> new RuntimeException("Expected at least 1 validity date to be present.")) + .asTable(name(VALIDITY_DATES_TABLE)); + } - private SelectConditionStep createValidityDateQuery(ColumnDateRange columnDateRange, Connector connector, Optional childCondition) { - return dslContext.select(columnDateRange.toFields()) - .from(table(name(connector.getResolvedTable().getName()))) - 
.where(toJooqCondition(connector, childCondition)); - } + private SelectConditionStep createValidityDateQuery( + final ColumnDateRange columnDateRange, + final Connector connector, + final Optional childCondition + ) { + return dslContext.select(columnDateRange.toFields()) + .from(table(name(connector.getResolvedTable().getName()))) + .where(toJooqCondition(connector, childCondition)); + } - private static org.jooq.Table union( - Collection input, - Function> mapper, - BinaryOperator> operator, - String tableName - ) { - return input.stream() - .map(mapper) - .reduce(operator) - .orElseThrow(() -> new IllegalStateException("Expected at least one element to union")) - .asTable(name(tableName)); - } + private static org.jooq.Table union( + final Collection input, + final Function> mapper, + final BinaryOperator> operator, + final String tableName + ) { + return input.stream() + .map(mapper) + .reduce(operator) + .orElseThrow(() -> new IllegalStateException("Expected at least one element to union")) + .asTable(name(tableName)); + } - private Condition toJooqCondition(Connector connector, Optional childCondition) { - CTConditionContext context = CTConditionContext.create(connector, functionProvider); - return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) - .map(condition -> condition.convertToSqlCondition(context).condition()) - .orElse(noCondition()); + private Condition toJooqCondition(final Connector connector, final Optional childCondition) { + final CTConditionContext context = CTConditionContext.create(connector, functionProvider); + return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) + .map(condition -> condition.convertToSqlCondition(context).condition()) + .orElse(noCondition()); + } } @RequiredArgsConstructor - private class SqlMatchingStatsTask implements Callable { + private class AllEqualConditionsTask implements Callable { - private final List connectors; - private final ConceptElement treeNode; + private 
final TreeConcept treeConcept; @Override public Void call() { - Optional childCondition = treeNode instanceof ConceptTreeChild treeChild - ? Optional.of(treeChild.getCondition()) - : Optional.empty(); - long events = collectEventCount(connectors, childCondition); - long entities = collectEntityCount(connectors, childCondition); - CDateRange span = collectDateSpan(connectors, childCondition); - - SqlMatchingStats matchingStats = new SqlMatchingStats(events, entities, span); - treeNode.setMatchingStats(matchingStats); + final Map> validityDateMap = createColumnDateRanges(treeConcept); + final Select unioned = treeConcept.getConnectors().stream() + .map(connector -> this.createConnectorQuery(connector, validityDateMap)) + .reduce(Select::unionAll) + .orElseThrow(IllegalStateException::new); + + final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); + final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); + final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); + final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); + + final Select query = dslContext.select( + field(CONNECTOR_COLUMN), + count(asterisk()).as(EVENTS), + countDistinct(field(ENTITIES)).as(ENTITIES), + validityDateExpression + ) + .from(unioned) + .groupBy(field(CONNECTOR_COLUMN)); + + final Map groupValueToStats = executionService.fetchStream(query).collect(Collectors.toMap( + record -> record.get(CONNECTOR_COLUMN, String.class), + record -> { + final long events = record.get(EVENTS, Integer.class).longValue(); + final long entities = record.get(ENTITIES, Integer.class).longValue(); + final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); + return new 
SqlMatchingStats(events, entities, dateSpan); + } + )); + + treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseThrow(IllegalStateException::new)); + setAndAggregate(treeConcept.getChildren(), groupValueToStats); return null; } + + private Map> createColumnDateRanges(final TreeConcept treeConcept) { + final AtomicInteger counter = new AtomicInteger(0); + return treeConcept.getConnectors().stream().collect(Collectors.toMap( + Function.identity(), + connector -> createColumnDateRanges(connector, counter) + )); + } + + private List createColumnDateRanges(final Connector connector, final AtomicInteger counter) { + return connector.getValidityDates().stream() + .map(functionProvider::forValidityDate) + .map(daterange -> daterange.as("%s-%d".formatted(SharedAliases.DATES_COLUMN.getAlias(), counter.incrementAndGet()))) + .toList(); + } + + private Select createConnectorQuery(final ConceptTreeConnector connector, final Map> validityDateMap) { + + final Table connectorTable = table(name(connector.getResolvedTable().getName())); + final Field connectorColumn = field(name(connectorTable.getName(), connector.getColumn().resolve().getName())).as(CONNECTOR_COLUMN); + final Field primaryKey = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); + + // we have to select all possible validity dates of all connectors because we have to union multiple connectors + final Stream> validityDates = + validityDateMap.entrySet().stream().flatMap(entry -> entry.getValue().stream() + .map(columnDateRange -> entry.getKey() == connector + ? 
columnDateRange + : functionProvider.nulled(columnDateRange)) + .flatMap(columnDateRange -> columnDateRange.toFields().stream())); + + return dslContext.select(Stream.concat(Stream.of(connectorColumn, primaryKey), validityDates).toList()) + .from(connectorTable); + } + + private void setAndAggregate(final List children, final Map groupValueToStats) { + children.forEach(child -> { + final SqlMatchingStats nodeStats = new SqlMatchingStats(); + // node is leaf + if (child.getChildren().isEmpty()) { + collectByCondition(groupValueToStats, child, nodeStats); + } + else { + // although node is not a leaf, it can have a condition + if (child.getCondition() != null) { + collectByCondition(groupValueToStats, child, nodeStats); + } + // recursively collect matching stats of children + setAndAggregate(child.getChildren(), groupValueToStats); + } + child.setMatchingStats(nodeStats); + }); + } + + private static void collectByCondition(final Map groupValueToStats, final ConceptTreeChild node, final SqlMatchingStats nodeStats) { + final EqualCondition condition = (EqualCondition) node.getCondition(); + condition.getValues().forEach(val -> { + final SqlMatchingStats statsForCondition = groupValueToStats.get(val); + // not all possible conditions must have a corresponding value in database which the results have been grouped by + if (statsForCondition == null) { + return; + } + nodeStats.add(statsForCondition); + }); + } + } } diff --git a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/HanaSqlFunctionProvider.java b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/HanaSqlFunctionProvider.java index 1dcd4286cc..2e644ae12f 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/HanaSqlFunctionProvider.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/HanaSqlFunctionProvider.java @@ -89,7 +89,8 @@ public ColumnDateRange forCDateRange(CDateRange daterange) { if (daterange.hasUpperBound()) { // end 
date is expected to be handled as exclusive, but if it's already the maximum date, we can't add +1 day if (Objects.equals(daterange.getMax(), LocalDate.ofEpochDay(CDateRange.POSITIVE_INFINITY))) { - throw new UnsupportedOperationException("Given daterange has an upper bound of CDateRange.POSITIVE_INFINITY, which is not supported by ConQuery's HANA dialect."); + throw new UnsupportedOperationException( + "Given daterange has an upper bound of CDateRange.POSITIVE_INFINITY, which is not supported by ConQuery's HANA dialect."); } LocalDate exclusiveMaxDate = daterange.getMax().plusDays(1); endDateExpression = exclusiveMaxDate.toString(); @@ -147,6 +148,11 @@ public ColumnDateRange aggregated(ColumnDateRange columnDateRange) { .as(columnDateRange.getAlias()); } + @Override + public ColumnDateRange nulled(ColumnDateRange columnDateRange) { + return ColumnDateRange.of(toDateField(null), toDateField(null)).as(columnDateRange.getAlias()); + } + @Override public ColumnDateRange toDualColumn(ColumnDateRange columnDateRange) { // HANA does not support single column ranges diff --git a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java index 0201bd8c73..eca8ca1452 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/PostgreSqlFunctionProvider.java @@ -135,6 +135,12 @@ public ColumnDateRange aggregated(ColumnDateRange columnDateRange) { return ColumnDateRange.of(rangeAgg(columnDateRange)).as(columnDateRange.getAlias()); } + @Override + public ColumnDateRange nulled(ColumnDateRange columnDateRange) { + ensureIsSingleColumnRange(columnDateRange); + return ColumnDateRange.of(DSL.field("null::daterange")).as(columnDateRange.getAlias()); + } + @Override public ColumnDateRange toDualColumn(ColumnDateRange 
columnDateRange) { Field daterange = columnDateRange.getRange(); diff --git a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/SqlFunctionProvider.java b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/SqlFunctionProvider.java index 46572215ea..f6fd446e9f 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/SqlFunctionProvider.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/conversion/dialect/SqlFunctionProvider.java @@ -84,6 +84,11 @@ public interface SqlFunctionProvider { ColumnDateRange aggregated(ColumnDateRange columnDateRange); + /** + * Create an aliased null-value {@link ColumnDateRange} from the given range. Example: {@code null::daterange as "date_range"} + */ + ColumnDateRange nulled(ColumnDateRange columnDateRange); + /** * Given a single-column {@link ColumnDateRange}, it will create a new {@link ColumnDateRange} with a start and end field. * For dialects that don't support single-column ranges, it will create a copy of the given {@link ColumnDateRange}. 
diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java index b17d3db261..b65541af85 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java @@ -17,7 +17,8 @@ class DefaultResultSetProcessor implements ResultSetProcessor { private final ConqueryConfig config; - private final SqlCDateSetParser sqlCDateSetParser; + @lombok.Getter + private final SqlCDateSetParser cDateSetParser; @Override public String getString(ResultSet resultSet, int columnIndex) throws SQLException { @@ -60,12 +61,12 @@ public Integer getDate(ResultSet resultSet, int columnIndex) throws SQLException @Override public List getDateRange(ResultSet resultSet, int columnIndex) throws SQLException { - return this.sqlCDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); + return this.cDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); } @Override public List> getDateRangeList(ResultSet resultSet, int columnIndex) throws SQLException { - return this.sqlCDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); + return this.cDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java index a354a92f10..8f7bf0c839 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java @@ -47,7 +47,7 @@ public List toEpochDayRange(String daterange) { } String[] dates = daterange.split(DATE_SEPARATOR); - Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start 
and end."); + Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start and end. Input was: %s".formatted(daterange)); // the dateranges have always an included start date marked by a [ String startDateExpression = dates[0]; diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java index 074716073d..a7e6498751 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java @@ -9,6 +9,8 @@ public interface ResultSetProcessor { char UNIT_SEPARATOR = (char) 31; // https://www.ascii-code.com/character/%E2%90%9F + SqlCDateSetParser getCDateSetParser(); + String getString(ResultSet resultSet, int columnIndex) throws SQLException; Integer getInteger(ResultSet resultSet, int columnIndex) throws SQLException; From cecd7913d18147fd196f87ead15996f2ad6dfbcc Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Fri, 25 Oct 2024 12:38:32 +0200 Subject: [PATCH 06/31] Revert "Deactivate mappings and filter job" This reverts commit 63a5832bc2a922ed1232509ab282b59f53226aa9. 
--- .../java/com/bakdata/conquery/models/worker/Namespace.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java index 327e06f5d6..7f112044de 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java @@ -134,9 +134,9 @@ public void postprocessData() { getJobManager().addSlowJob(new SimpleJob( "Initiate Update Matching Stats and FilterSearch", () -> { - // updateInternToExternMappings(); + updateInternToExternMappings(); updateMatchingStats(); - // updateFilterSearch(); + updateFilterSearch(); } )); From 7e2105297b543dc0d91cb126b29417ba9584a77f Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Fri, 25 Oct 2024 13:19:13 +0200 Subject: [PATCH 07/31] Revert changes to MetadataCollectionTest --- .../tests/MetadataCollectionTest.java | 59 ++++++++----------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java index e76ec7ad19..c6573cb181 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java @@ -3,16 +3,17 @@ import static org.assertj.core.api.Assertions.assertThat; import java.time.LocalDate; -import java.util.Set; import com.bakdata.conquery.integration.IntegrationTest; import com.bakdata.conquery.integration.json.ConqueryTestSpec; import com.bakdata.conquery.integration.json.JsonIntegrationTest; import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; +import 
com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.exceptions.ValidatorHelper; import com.bakdata.conquery.models.identifiable.ids.specific.DatasetId; +import com.bakdata.conquery.models.messages.namespaces.specific.UpdateMatchingStatsMessage; +import com.bakdata.conquery.models.worker.DistributedNamespace; import com.bakdata.conquery.util.support.StandaloneSupport; import com.github.powerlibraries.io.In; import lombok.extern.slf4j.Slf4j; @@ -20,51 +21,39 @@ @Slf4j public class MetadataCollectionTest extends IntegrationTest.Simple implements ProgrammaticIntegrationTest { - @Override - public Set forModes() { - return Set.of(StandaloneSupport.Mode.WORKER, StandaloneSupport.Mode.SQL); - } - @Override public void execute(StandaloneSupport conquery) throws Exception { + //read test sepcification + String testJson = In.resource("/tests/query/SIMPLE_TREECONCEPT_QUERY/SIMPLE_TREECONCEPT_Query.test.json").withUTF8().readAll(); - //read test specification - String testJson = In.resource("/shared/icd.test.json").withUTF8().readAll(); DatasetId dataset = conquery.getDataset().getId(); + ConqueryTestSpec test = JsonIntegrationTest.readJson(dataset, testJson); ValidatorHelper.failOnError(log, conquery.getValidator().validate(test)); + test.importRequiredData(conquery); - // triggers update matching stats - conquery.getNamespace().postprocessData(); + //ensure the metadata is collected + DistributedNamespace namespace = (DistributedNamespace) conquery.getNamespace(); + namespace.getWorkerHandler() + .sendToAll(new UpdateMatchingStatsMessage(conquery.getNamespace().getStorage().getAllConcepts().map(Concept::getId).toList())); + conquery.waitUntilWorkDone(); - TreeConcept concept = (TreeConcept) conquery.getNamespace().getStorage().getAllConcepts().toList().iterator().next(); - //check the number of matched events from root node to the deepest child node - 
assertThat(concept.getMatchingStats().countEvents()).isEqualTo(10); - assertThat(concept.getMatchingStats().countEntities()).isEqualTo(3); - // concepts 1. child (F00-F99) - ConceptTreeChild f00_99 = concept.getChildren().get(0); - assertThat(f00_99.getMatchingStats().countEvents()).isEqualTo(8); - assertThat(f00_99.getMatchingStats().countEntities()).isEqualTo(3); - // 1. child's child (F20-29) - ConceptTreeChild f20_29 = f00_99.getChildren().get(0); - assertThat(f20_29.getMatchingStats().countEvents()).isEqualTo(7); - assertThat(f20_29.getMatchingStats().countEntities()).isEqualTo(2); - // 1. child's child's child (yeah it's getting wild) - ConceptTreeChild f20 = f20_29.getChildren().get(0); - assertThat(f20.getMatchingStats().countEvents()).isEqualTo(5); - assertThat(f20.getMatchingStats().countEntities()).isEqualTo(1); - // 1. child's child's child's children (I promise it won't get worse) - assertThat(f20.getChildren()).allSatisfy(child -> { - assertThat(child.getMatchingStats().countEvents()).isEqualTo(1); - assertThat(child.getMatchingStats().countEntities()).isEqualTo(1); - }); + TreeConcept concept = (TreeConcept) conquery.getNamespace().getStorage().getAllConcepts().iterator().next(); + //check the number of matched events + assertThat(concept.getMatchingStats().countEvents()).isEqualTo(4); + assertThat(concept.getChildren()).allSatisfy(c -> { + assertThat(c.getMatchingStats().countEvents()).isEqualTo(2); + }); + //check the date ranges assertThat(concept.getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2009-05-18"), LocalDate.parse("2023-08-20"))); - assertThat(f20.getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-01"), LocalDate.parse("2023-02-18"))); + .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2013-11-10"))); + assertThat(concept.getChildren().get(0).getMatchingStats().spanEvents()) + .isEqualTo(CDateRange.of(LocalDate.parse("2012-01-01"), 
LocalDate.parse("2013-11-10"))); + assertThat(concept.getChildren().get(1).getMatchingStats().spanEvents()) + .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2012-11-11"))); } } From 52d097e55378b2b1dec3036517950ec331ea41ab Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Fri, 25 Oct 2024 13:23:08 +0200 Subject: [PATCH 08/31] Revert changes to ConceptTreeNode --- .../models/datasets/concepts/tree/ConceptTreeNode.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java index 6c6bd91b62..75f5d9d532 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/tree/ConceptTreeNode.java @@ -3,6 +3,7 @@ import java.util.List; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.identifiable.Named; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; import com.fasterxml.jackson.annotation.JsonBackReference; @@ -13,20 +14,16 @@ public interface ConceptTreeNode getChildren(); - int getLocalId(); - int getDepth(); - @JsonIgnore int[] getPrefix(); - @JsonBackReference ConceptTreeNode getParent(); - void setLocalId(int size); - String getDescription(); + MatchingStats getMatchingStats(); + String getDescription(); String getLabel(); } From 7f7523cbb79636d0744c027d9f0123554ebcd611 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Sat, 26 Oct 2024 10:03:22 +0200 Subject: [PATCH 09/31] Simplify condition match methods --- .../mode/local/SqlUpdateMatchingStatsJob.java | 98 +++++++++++-------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git 
a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 87127ebd3e..52be447d1f 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -5,6 +5,7 @@ import java.math.BigDecimal; import java.sql.Date; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; @@ -30,6 +31,7 @@ import com.bakdata.conquery.models.datasets.concepts.conditions.CTCondition; import com.bakdata.conquery.models.datasets.concepts.conditions.EqualCondition; import com.bakdata.conquery.models.datasets.concepts.conditions.PrefixCondition; +import com.bakdata.conquery.models.datasets.concepts.conditions.PrefixRangeCondition; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeConnector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; @@ -100,8 +102,7 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); final List regularApproach = new ArrayList<>(); - final List onlyEqualConditions = new ArrayList<>(); - final List prefixConcept = new ArrayList<>(); + final List onlyEqualOrPrefixConditions = new ArrayList<>(); concepts.stream() .parallel() @@ -109,27 +110,25 @@ public void execute() throws Exception { .filter(SqlUpdateMatchingStatsJob::isTreeConcept) .forEach(concept -> { final TreeConcept treeConcept = (TreeConcept) concept; - if (treeConcept.getChildren().isEmpty()) { + final List conceptChildren = treeConcept.getChildren(); + if (conceptChildren.isEmpty()) { regularApproach.add(treeConcept); } - else if (anyConditionMatches(treeConcept.getChildren(), PrefixCondition.class)) { - 
prefixConcept.add(treeConcept); - } - else if (allConditionsMatch(treeConcept.getChildren(), EqualCondition.class)) { - onlyEqualConditions.add(treeConcept); + else if (allConditionsOneOf(conceptChildren, List.of(EqualCondition.class, PrefixCondition.class, PrefixRangeCondition.class))) { + onlyEqualOrPrefixConditions.add(treeConcept); } else { regularApproach.add(treeConcept); } }); - log.info("Skipping matching stats calc for prefix concepts: {}", prefixConcept.stream().map(Concept::getName).toList()); + log.info("Matching Stats classification: regular => {}, onlyEqualOrPrefix => {}", regularApproach.size(), onlyEqualOrPrefixConditions.size()); final long startTime = System.currentTimeMillis(); final List> runningQueries = Stream.concat( regularApproach.stream().flatMap(concept -> walkAndCollectMatchingStats(concept.getConnectors(), concept)), - onlyEqualConditions.stream().map(AllEqualConditionsTask::new) + onlyEqualOrPrefixConditions.stream().map(AllEqualOrPrefixConditionsTask::new) ) .parallel() .map(executors::submit) @@ -160,27 +159,15 @@ private static boolean isTreeConcept(final Concept concept) { return true; } - private boolean allConditionsMatch(final List children, final Class condition) { + private boolean allConditionsOneOf(final List children, final List> conditions) { return children.stream().allMatch(child -> { if (child.getChildren().isEmpty()) { - return condition.isInstance(child.getCondition()); + return child.getCondition() != null && conditions.stream().anyMatch(condition -> condition.isInstance(child.getCondition())); } - if (child.getCondition() != null && !condition.isInstance(child.getCondition())) { + if (child.getCondition() != null && conditions.stream().noneMatch(condition -> condition.isInstance(child.getCondition()))) { return false; } - return allConditionsMatch(child.getChildren(), condition); - }); - } - - private boolean anyConditionMatches(final List children, final Class condition) { - return children.stream().anyMatch(child 
-> { - if (child.getChildren().isEmpty()) { - return condition.isInstance(child.getCondition()); - } - if (child.getCondition() != null && condition.isInstance(child.getCondition())) { - return true; - } - return anyConditionMatches(child.getChildren(), condition); + return allConditionsOneOf(child.getChildren(), conditions); }); } @@ -379,7 +366,7 @@ private Condition toJooqCondition(final Connector connector, final Optional { + private class AllEqualOrPrefixConditionsTask implements Callable { private final TreeConcept treeConcept; @@ -417,8 +404,8 @@ record -> { } )); - treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseThrow(IllegalStateException::new)); - setAndAggregate(treeConcept.getChildren(), groupValueToStats); + treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseGet(SqlMatchingStats::new)); + setAndAggregate(groupValueToStats, treeConcept.getChildren()); return null; } @@ -456,7 +443,7 @@ private Select createConnectorQuery(final ConceptTreeConnector connector, fin .from(connectorTable); } - private void setAndAggregate(final List children, final Map groupValueToStats) { + private void setAndAggregate(final Map groupValueToStats, final List children) { children.forEach(child -> { final SqlMatchingStats nodeStats = new SqlMatchingStats(); // node is leaf @@ -469,22 +456,55 @@ private void setAndAggregate(final List children, final Map groupValueToStats, final ConceptTreeChild node, final SqlMatchingStats nodeStats) { - final EqualCondition condition = (EqualCondition) node.getCondition(); - condition.getValues().forEach(val -> { - final SqlMatchingStats statsForCondition = groupValueToStats.get(val); - // not all possible conditions must have a corresponding value in database which the results have been grouped by - if (statsForCondition == null) { - return; - } + + // TODO make those methods of CTCondition and call directly on node.condition() + if 
(node.getCondition() instanceof EqualCondition equalCondition) { + equalCondition.getValues().forEach(val -> { + final SqlMatchingStats statsForCondition = groupValueToStats.getOrDefault(val, new SqlMatchingStats()); + nodeStats.add(statsForCondition); + }); + return; + } + else if (node.getCondition() instanceof PrefixCondition prefixCondition) { + Arrays.stream(prefixCondition.getPrefixes()).forEach(prefix -> { + final SqlMatchingStats statsForCondition = groupValueToStats.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(prefix)) + .map(Map.Entry::getValue) + .reduce(SqlMatchingStats::add) + .orElseGet(SqlMatchingStats::new); + nodeStats.add(statsForCondition); + }); + return; + } + else if (node.getCondition() instanceof PrefixRangeCondition prefixRangeCondition) { + final SqlMatchingStats statsForCondition = groupValueToStats.entrySet().stream() + .filter(entry -> { + + final String groupValue = entry.getKey(); + final String min = prefixRangeCondition.getMin(); + final String max = prefixRangeCondition.getMax(); + + if (groupValue.length() < min.length()) { + return false; + } + + String pref = groupValue.substring(0, min.length()); + return min.compareTo(pref) <= 0 && max.compareTo(pref) >= 0; + }) + .map(Map.Entry::getValue) + .reduce(SqlMatchingStats::add) + .orElseGet(SqlMatchingStats::new); nodeStats.add(statsForCondition); - }); + return; + } + throw new IllegalArgumentException("Unsupported condition type: " + node.getCondition().getClass().getSimpleName()); } } From a87950c52cfec66e39556be6531000fede05533b Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Sun, 27 Oct 2024 11:19:37 +0100 Subject: [PATCH 10/31] Use SqlMatchingStats::empty for more fluent API --- .../com/bakdata/conquery/mode/local/SqlMatchingStats.java | 4 ++++ .../conquery/mode/local/SqlUpdateMatchingStatsJob.java | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java 
b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java index e370dc7963..b5b283a6e1 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java @@ -17,6 +17,10 @@ public class SqlMatchingStats implements MatchingStats { private long numberOfEntities; private CDateRange span; + public static SqlMatchingStats empty() { + return new SqlMatchingStats(); + } + @Override public long countEvents() { return numberOfEvents; diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 52be447d1f..d1d4bbaa4a 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -404,7 +404,7 @@ record -> { } )); - treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseGet(SqlMatchingStats::new)); + treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseGet(SqlMatchingStats::empty)); setAndAggregate(groupValueToStats, treeConcept.getChildren()); return null; @@ -467,7 +467,7 @@ private static void collectByCondition(final Map group // TODO make those methods of CTCondition and call directly on node.condition() if (node.getCondition() instanceof EqualCondition equalCondition) { equalCondition.getValues().forEach(val -> { - final SqlMatchingStats statsForCondition = groupValueToStats.getOrDefault(val, new SqlMatchingStats()); + final SqlMatchingStats statsForCondition = groupValueToStats.getOrDefault(val, SqlMatchingStats.empty()); nodeStats.add(statsForCondition); }); return; @@ -478,7 +478,7 @@ else if (node.getCondition() instanceof PrefixCondition prefixCondition) { .filter(entry -> 
entry.getKey().startsWith(prefix)) .map(Map.Entry::getValue) .reduce(SqlMatchingStats::add) - .orElseGet(SqlMatchingStats::new); + .orElseGet(SqlMatchingStats::empty); nodeStats.add(statsForCondition); }); return; @@ -500,7 +500,7 @@ else if (node.getCondition() instanceof PrefixRangeCondition prefixRangeConditio }) .map(Map.Entry::getValue) .reduce(SqlMatchingStats::add) - .orElseGet(SqlMatchingStats::new); + .orElseGet(SqlMatchingStats::empty); nodeStats.add(statsForCondition); return; } From de118668ef08a7fdd59c34a16826e32abb2aef4b Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Sun, 27 Oct 2024 15:44:22 +0100 Subject: [PATCH 11/31] Remove parallel calls --- .../bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index d1d4bbaa4a..71ce4ab7d8 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -105,7 +105,6 @@ public void execute() throws Exception { final List onlyEqualOrPrefixConditions = new ArrayList<>(); concepts.stream() - .parallel() .map(ConceptId::resolve) .filter(SqlUpdateMatchingStatsJob::isTreeConcept) .forEach(concept -> { @@ -130,7 +129,6 @@ else if (allConditionsOneOf(conceptChildren, List.of(EqualCondition.class, Prefi regularApproach.stream().flatMap(concept -> walkAndCollectMatchingStats(concept.getConnectors(), concept)), onlyEqualOrPrefixConditions.stream().map(AllEqualOrPrefixConditionsTask::new) ) - .parallel() .map(executors::submit) .toList(); From 4797e825a6faa3481603b5f55ab5688a38cbbd5d Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Tue, 29 Oct 2024 16:13:52 +0100 Subject: [PATCH 12/31] Use parallel in setAndAggregate --- 
.../conquery/mode/local/SqlUpdateMatchingStatsJob.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 71ce4ab7d8..8cb34b2636 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -75,6 +75,7 @@ public class SqlUpdateMatchingStatsJob extends Job { private final SqlFunctionProvider functionProvider; private final Set concepts; private final ExecutorService executors; + private final AtomicInteger counter; public SqlUpdateMatchingStatsJob( DatabaseConfig databaseConfig, @@ -89,6 +90,7 @@ public SqlUpdateMatchingStatsJob( this.functionProvider = functionProvider; this.concepts = concepts; this.executors = executors; + this.counter = new AtomicInteger(0); } @Override @@ -138,7 +140,7 @@ else if (allConditionsOneOf(conceptChildren, List.of(EqualCondition.class, Prefi } final long timeElapsed = System.currentTimeMillis() - startTime; - log.info("DONE collecting matching stats. Elapsed time: {} ms", timeElapsed); + log.info("DONE collecting matching stats. Elapsed time: {} ms. 
Executed standard queries: {}", timeElapsed, counter.get()); runningQueries.forEach(SqlUpdateMatchingStatsJob::checkForError); } @@ -208,6 +210,7 @@ public Void call() { final SqlMatchingStats matchingStats = new SqlMatchingStats(events, entities, span); treeNode.setMatchingStats(matchingStats); + counter.incrementAndGet(); return null; } @@ -442,7 +445,7 @@ private Select createConnectorQuery(final ConceptTreeConnector connector, fin } private void setAndAggregate(final Map groupValueToStats, final List children) { - children.forEach(child -> { + children.stream().parallel().forEach(child -> { final SqlMatchingStats nodeStats = new SqlMatchingStats(); // node is leaf if (child.getChildren().isEmpty()) { From a8bf45c361e78b564bb43d47f97e8f8a348402e1 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 30 Oct 2024 14:22:09 +0100 Subject: [PATCH 13/31] Simplify SQL matching stats calc --- .../mode/cluster/WorkerMatchingStats.java | 132 ----- .../cluster/WorkerUpdateMatchingStatsJob.java | 9 +- .../conquery/mode/local/SqlMatchingStats.java | 86 --- .../mode/local/SqlUpdateMatchingStatsJob.java | 489 ++++++------------ .../datasets/concepts/MatchingStats.java | 122 ++++- .../concepts/conditions/AndCondition.java | 17 +- .../concepts/conditions/CTCondition.java | 12 +- .../conditions/ColumnEqualCondition.java | 21 +- .../concepts/conditions/EqualCondition.java | 19 +- .../concepts/conditions/GroovyCondition.java | 8 +- .../conditions/IsPresentCondition.java | 13 +- .../concepts/conditions/NotCondition.java | 15 +- .../concepts/conditions/OrCondition.java | 14 +- .../concepts/conditions/PrefixCondition.java | 15 +- .../conditions/PrefixRangeCondition.java | 13 + .../specific/UpdateElementMatchingStats.java | 14 +- .../concepts/tree/MatchingStatsTests.java | 20 +- 17 files changed, 426 insertions(+), 593 deletions(-) delete mode 100644 backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java delete mode 100644 
backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java diff --git a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java deleted file mode 100644 index a5650c4587..0000000000 --- a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerMatchingStats.java +++ /dev/null @@ -1,132 +0,0 @@ -package com.bakdata.conquery.mode.cluster; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.Column; -import com.bakdata.conquery.models.datasets.Table; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; -import com.bakdata.conquery.models.events.Bucket; -import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; -import com.fasterxml.jackson.annotation.JsonIgnore; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; - -@Getter -@Setter -public class WorkerMatchingStats implements MatchingStats { - - private Map entries = new HashMap<>(); - - @JsonIgnore - private transient CDateRange span; - - @JsonIgnore - private transient long numberOfEvents = -1L; - - @JsonIgnore - private transient long numberOfEntities = -1L; - - public long countEvents() { - if (numberOfEvents == -1L) { - synchronized (this) { - if (numberOfEvents == -1L) { - numberOfEvents = entries.values().stream().mapToLong(Entry::getNumberOfEvents).sum(); - } - } - } - return numberOfEvents; - } - - - public long countEntities() { - if (numberOfEntities == -1L) { - synchronized (this) { - if (numberOfEntities == -1L) { - numberOfEntities = entries.values().stream().mapToLong(Entry::getNumberOfEntities).sum(); - } - } - } - return numberOfEntities; - } - - public CDateRange spanEvents() { - if (span == null) { - 
synchronized (this) { - if (span == null) { - span = entries.values().stream().map(Entry::getSpan).reduce(CDateRange.all(), CDateRange::spanClosed); - } - } - } - return span; - - } - - public void putEntry(WorkerId source, Entry entry) { - synchronized (this) { - entries.put(source, entry); - span = null; - numberOfEntities = -1L; - numberOfEvents = -1L; - } - } - - @Data - @NoArgsConstructor - @AllArgsConstructor - public static class Entry { - private long numberOfEvents; - - @JsonIgnore - private final Set foundEntities = new HashSet<>(); - private long numberOfEntities; - private int minDate = Integer.MAX_VALUE; - private int maxDate = Integer.MIN_VALUE; - - @JsonIgnore - public CDateRange getSpan() { - if (minDate == Integer.MAX_VALUE && maxDate == Integer.MIN_VALUE) { - return null; - } - - return CDateRange.of( - minDate == Integer.MAX_VALUE ? Integer.MIN_VALUE : minDate, - maxDate == Integer.MIN_VALUE ? Integer.MAX_VALUE : maxDate - ); - } - - public void addEvent(Table table, Bucket bucket, int event, String entityForEvent) { - numberOfEvents++; - if (foundEntities.add(entityForEvent)) { - numberOfEntities++; - } - - for (Column c : table.getColumns()) { - if (!c.getType().isDateCompatible()) { - continue; - } - - if (!bucket.has(event, c)) { - continue; - } - - final CDateRange time = bucket.getAsDateRange(event, c); - - if (time.hasUpperBound()) { - maxDate = Math.max(time.getMaxValue(), maxDate); - } - - if (time.hasLowerBound()) { - minDate = Math.min(time.getMinValue(), minDate); - } - } - } - } - -} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java index 7e3b7e92a4..71d65a3ffc 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java @@ -13,6 +13,7 @@ import 
com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.events.Bucket; @@ -51,7 +52,7 @@ public void execute() throws Exception { .collect(Collectors.toMap(Functions.identity(), concept -> CompletableFuture.runAsync(() -> { final Concept resolved = concept.resolve(); - final Map, WorkerMatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); + final Map, MatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); calculateConceptMatches(resolved, matchingStats, worker); worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), matchingStats)); @@ -100,7 +101,7 @@ public String getLabel() { return String.format("Calculate Matching Stats for %s", worker.getInfo().getDataset()); } - private static void calculateConceptMatches(Concept concept, Map, WorkerMatchingStats.Entry> results, Worker worker) { + private static void calculateConceptMatches(Concept concept, Map, MatchingStats.Entry> results, Worker worker) { log.debug("BEGIN calculating for `{}`", concept.getId()); for (CBlock cBlock : worker.getStorage().getAllCBlocks().toList()) { @@ -123,7 +124,7 @@ private static void calculateConceptMatches(Concept concept, Map new WorkerMatchingStats.Entry()).addEvent(table, bucket, event, entity); + results.computeIfAbsent(concept.getId(), (ignored) -> new MatchingStats.Entry()).addEvent(table, bucket, event, entity); continue; } @@ -134,7 +135,7 @@ private static void calculateConceptMatches(Concept concept, Map element = ((TreeConcept) concept).getElementByLocalIdPath(localIds); while (element != null) { - results.computeIfAbsent(((ConceptElement) 
element).getId(), (ignored) -> new WorkerMatchingStats.Entry()) + results.computeIfAbsent(((ConceptElement) element).getId(), (ignored) -> new MatchingStats.Entry()) .addEvent(table, bucket, event, entity); element = element.getParent(); } diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java deleted file mode 100644 index b5b283a6e1..0000000000 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlMatchingStats.java +++ /dev/null @@ -1,86 +0,0 @@ -package com.bakdata.conquery.mode.local; - -import java.time.LocalDate; - -import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.NoArgsConstructor; - -@Data -@NoArgsConstructor -@AllArgsConstructor -public class SqlMatchingStats implements MatchingStats { - - private long numberOfEvents; - private long numberOfEntities; - private CDateRange span; - - public static SqlMatchingStats empty() { - return new SqlMatchingStats(); - } - - @Override - public long countEvents() { - return numberOfEvents; - } - - @Override - public long countEntities() { - return numberOfEntities; - } - - @Override - public CDateRange spanEvents() { - return span; - } - - public SqlMatchingStats add(SqlMatchingStats other) { - - this.numberOfEvents += other.numberOfEvents; - this.numberOfEntities += other.numberOfEntities; - - if ((this.span == null && other.span == null) || this.span != null && other.span == null) { - return this; - } - else if (this.span == null) { - this.span = other.span; - return this; - } - else { - - final LocalDate thisMin = this.span.getMin(); - final LocalDate otherMin = other.getSpan().getMin(); - final LocalDate min; - - if (thisMin == null) { - min = otherMin; - } - else if (otherMin == null) { - min = thisMin; - } - else { - min = 
thisMin.isBefore(otherMin) ? thisMin : otherMin; - } - - final LocalDate thisMax = this.span.getMax(); - final LocalDate otherMax = other.getSpan().getMax(); - final LocalDate max; - - if (thisMax == null) { - max = otherMax; - } - else if (otherMax == null) { - max = thisMax; - } - else { - max = thisMax.isAfter(otherMax) ? thisMax : otherMax; - } - - this.span = CDateRange.of(min, max); - return this; - } - } - -} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 8cb34b2636..3b532ee164 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -2,14 +2,10 @@ import static org.jooq.impl.DSL.*; -import java.math.BigDecimal; import java.sql.Date; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.concurrent.Callable; @@ -18,24 +14,23 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.BinaryOperator; import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.Stream; import com.bakdata.conquery.models.common.daterange.CDateRange; import com.bakdata.conquery.models.config.DatabaseConfig; import com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.datasets.concepts.conditions.CTCondition; -import 
com.bakdata.conquery.models.datasets.concepts.conditions.EqualCondition; -import com.bakdata.conquery.models.datasets.concepts.conditions.PrefixCondition; -import com.bakdata.conquery.models.datasets.concepts.conditions.PrefixRangeCondition; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeCache; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeConnector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; +import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.jobs.Job; import com.bakdata.conquery.sql.conversion.SharedAliases; @@ -43,6 +38,7 @@ import com.bakdata.conquery.sql.conversion.dialect.SqlFunctionProvider; import com.bakdata.conquery.sql.conversion.model.ColumnDateRange; import com.bakdata.conquery.sql.execution.SqlExecutionService; +import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.TablePrimaryColumnUtil; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -51,11 +47,7 @@ import org.jooq.Field; import org.jooq.Name; import org.jooq.Record; -import org.jooq.Record1; -import org.jooq.Result; import org.jooq.Select; -import org.jooq.SelectConditionStep; -import org.jooq.SelectJoinStep; import org.jooq.Table; @Slf4j @@ -63,11 +55,8 @@ public class SqlUpdateMatchingStatsJob extends Job { private static final Name CONNECTOR_COLUMN = name("connector_column"); private static final Name EVENTS = name("events"); - private static final String EVENTS_TABLE = "events_unioned"; private static final Name ENTITIES = name("entities"); - private static final String ENTITIES_TABLE = "entities"; - private static 
final Name DATES = name("unioned"); - private static final String VALIDITY_DATES_TABLE = "validity_dates"; + private static final Name DATES = name("dates"); private final DatabaseConfig databaseConfig; private final SqlExecutionService executionService; @@ -75,7 +64,6 @@ public class SqlUpdateMatchingStatsJob extends Job { private final SqlFunctionProvider functionProvider; private final Set concepts; private final ExecutorService executors; - private final AtomicInteger counter; public SqlUpdateMatchingStatsJob( DatabaseConfig databaseConfig, @@ -90,7 +78,6 @@ public SqlUpdateMatchingStatsJob( this.functionProvider = functionProvider; this.concepts = concepts; this.executors = executors; - this.counter = new AtomicInteger(0); } @Override @@ -103,36 +90,14 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); - final List regularApproach = new ArrayList<>(); - final List onlyEqualOrPrefixConditions = new ArrayList<>(); - - concepts.stream() - .map(ConceptId::resolve) - .filter(SqlUpdateMatchingStatsJob::isTreeConcept) - .forEach(concept -> { - final TreeConcept treeConcept = (TreeConcept) concept; - final List conceptChildren = treeConcept.getChildren(); - if (conceptChildren.isEmpty()) { - regularApproach.add(treeConcept); - } - else if (allConditionsOneOf(conceptChildren, List.of(EqualCondition.class, PrefixCondition.class, PrefixRangeCondition.class))) { - onlyEqualOrPrefixConditions.add(treeConcept); - } - else { - regularApproach.add(treeConcept); - } - }); - - log.info("Matching Stats classification: regular => {}, onlyEqualOrPrefix => {}", regularApproach.size(), onlyEqualOrPrefixConditions.size()); - final long startTime = System.currentTimeMillis(); - final List> runningQueries = - Stream.concat( - regularApproach.stream().flatMap(concept -> walkAndCollectMatchingStats(concept.getConnectors(), concept)), - onlyEqualOrPrefixConditions.stream().map(AllEqualOrPrefixConditionsTask::new) - ) - 
.map(executors::submit) - .toList(); + final List> runningQueries = concepts.stream() + .map(ConceptId::resolve) + .filter(SqlUpdateMatchingStatsJob::isTreeConcept) + .map(TreeConcept.class::cast) + .map(MatchingStatsTask::new) + .map(executors::submit) + .toList(); executors.shutdown(); while (!executors.awaitTermination(1, TimeUnit.MINUTES)) { @@ -140,7 +105,7 @@ else if (allConditionsOneOf(conceptChildren, List.of(EqualCondition.class, Prefi } final long timeElapsed = System.currentTimeMillis() - startTime; - log.info("DONE collecting matching stats. Elapsed time: {} ms. Executed standard queries: {}", timeElapsed, counter.get()); + log.debug("DONE collecting matching stats. Elapsed time: {} ms.", timeElapsed); runningQueries.forEach(SqlUpdateMatchingStatsJob::checkForError); } @@ -159,18 +124,6 @@ private static boolean isTreeConcept(final Concept concept) { return true; } - private boolean allConditionsOneOf(final List children, final List> conditions) { - return children.stream().allMatch(child -> { - if (child.getChildren().isEmpty()) { - return child.getCondition() != null && conditions.stream().anyMatch(condition -> condition.isInstance(child.getCondition())); - } - if (child.getCondition() != null && conditions.stream().noneMatch(condition -> condition.isInstance(child.getCondition()))) { - return false; - } - return allConditionsOneOf(child.getChildren(), conditions); - }); - } - private static void checkForError(final Future future) { try { future.get(); @@ -180,235 +133,105 @@ private static void checkForError(final Future future) { } } - private Stream walkAndCollectMatchingStats(final List connectors, final ConceptTreeNode treeNode) { - return Stream.concat( - treeNode.getChildren().stream().flatMap(child -> walkAndCollectMatchingStats(connectors, child)), - Stream.of(new RegularTask(connectors, (ConceptElement) treeNode)) - ); - } - - private CDateRange toDateRange(final String validityDateExpression) { - final List dateRange = 
executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); - return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : null; - } - @RequiredArgsConstructor - private class RegularTask implements Callable { - - private final List connectors; - private final ConceptElement treeNode; - - @Override - public Void call() { - final Optional childCondition = treeNode instanceof ConceptTreeChild treeChild - ? Optional.of(treeChild.getCondition()) - : Optional.empty(); - - final long events = collectEventCount(connectors, childCondition); - final long entities = collectEntityCount(connectors, childCondition); - final CDateRange span = collectDateSpan(connectors, childCondition); - - final SqlMatchingStats matchingStats = new SqlMatchingStats(events, entities, span); - treeNode.setMatchingStats(matchingStats); - counter.incrementAndGet(); - - return null; - } - - /** - * Applies a count(*) on each connector's table, unions these tables and finally calculates the sum() of the count per connector - * to obtain the concept's total event count. - */ - private long collectEventCount(final List connectors, final Optional childCondition) { - - final org.jooq.Table> eventsUnioned = - union(connectors, connector -> createCountEventsQuery(connector, childCondition), Select::unionAll, EVENTS_TABLE); - - final SelectJoinStep> eventsQuery = dslContext.select(sum(eventsUnioned.field(EVENTS, BigDecimal.class)).as(EVENTS)) - .from(eventsUnioned); - - final Result result = executionService.fetch(eventsQuery); - try { - final BigDecimal events = (BigDecimal) result.getValue(0, field(EVENTS)); - return Objects.requireNonNull(events).longValue(); - } - catch (Exception e) { - log.error("Expecting exactly 1 column of numeric type and 1 row in Result when querying for events of a concept node. 
Error: ", e); - return 0; - } - } - - private SelectConditionStep> createCountEventsQuery(final Connector connector, final Optional childCondition) { - return dslContext.select(count().as(EVENTS)) - .from(table(name(connector.getResolvedTable().getName()))) - .where(toJooqCondition(connector, childCondition)); - } - - /** - * Selects the PIDs for each connector, unions these tables and does a countDistinct(pid) to obtain the concepts total entity count. - */ - private long collectEntityCount(final List connectors, final Optional childCondition) { - - final org.jooq.Table> entitiesUnioned = - union(connectors, connector -> createCountEntitiesQuery(connector, childCondition), Select::union, ENTITIES_TABLE); - - final SelectJoinStep> entitiesQuery = - dslContext.select(countDistinct(entitiesUnioned.field(ENTITIES)).as(ENTITIES)) - .from(entitiesUnioned); - - final Result result = executionService.fetch(entitiesQuery); - try { - // we will get an Integer as SQL return type of SUM select, but MatchingStats expect a long - final Integer value = (Integer) result.getValue(0, field(ENTITIES)); - return Objects.requireNonNull(value).longValue(); - } - catch (Exception e) { - log.error("Expecting exactly 1 column of type Integer and 1 row in Result when querying for events of a concept node. 
Error: ", e); - return 0; - } - } - - private SelectConditionStep> createCountEntitiesQuery(final Connector connector, final Optional childCondition) { - final Field primaryColumn = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); - final Table connectorTable = table(name(connector.getResolvedTable().getName())); - final Condition connectorCondition = toJooqCondition(connector, childCondition); - return dslContext.select(primaryColumn) - .from(connectorTable) - .where(connectorCondition); - } - - /** - * For each connector and each of its validity dates, we select the start and end date, union all these tables and select the min(start) and max(end) - * to obtain the concepts total date span. - * - * @return A {@link CDateRange} with the min and max validity date over all the given connectors. Null, if the given connectors have no validity date at all. - */ - private CDateRange collectDateSpan(final List connectors, final Optional childCondition) { - - final Map> validityDateMap = connectors.stream().collect( - // we create all validity dates with the same alias to union them later - Collectors.toMap(Function.identity(), this::createColumnDateRanges) - ); - if (validityDateMap.values().stream().allMatch(List::isEmpty)) { - return null; - } - - final org.jooq.Table validityDatesUnioned = unionAllValidityDates(validityDateMap, childCondition); - // we just need any of the generated column date ranges to get the name of the unioned field(s) - final ColumnDateRange anyOfTheUnionedDates = validityDateMap.get(connectors.get(0)).get(0); - // ensure we have a start and end field (and not a single-column range), because we need to get the min(start) and max(end) - final ColumnDateRange dualColumn = functionProvider.toDualColumn(anyOfTheUnionedDates); - // the get the overall min and max - final ColumnDateRange minAndMax = ColumnDateRange.of(min(dualColumn.getStart()), max(dualColumn.getEnd())); - // finally, we create the 
proper string expression which handles possible +/-infinity date values - final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); - final SelectJoinStep> dateSpanQuery = dslContext.select(validityDateExpression) - .from(validityDatesUnioned); - - final Result result = executionService.fetch(dateSpanQuery); - try { - final String dateExpression = (String) result.getValue(0, field(DATES)); - return toDateRange(dateExpression); - } - catch (Exception e) { - log.error("Expecting exactly 1 column containing a daterange expression when querying for the date span of a concept. Error: ", e); - return null; - } - } - - private List createColumnDateRanges(final Connector connector) { - return connector.getValidityDates().stream() - .map(functionProvider::forValidityDate) - .map(daterange -> daterange.as(DATES.last())) - .toList(); - } - - private org.jooq.Table unionAllValidityDates(final Map> validityDateMap, final Optional childCondition) { - return validityDateMap.entrySet().stream() - .flatMap(entry -> { - Connector connector = entry.getKey(); - List validityDates = entry.getValue(); - return validityDates.stream().map(columnDateRange -> createValidityDateQuery(columnDateRange, connector, childCondition)); - }) - .reduce((validityDate1, validityDate2) -> (SelectConditionStep) validityDate1.unionAll(validityDate2)) - .orElseThrow(() -> new RuntimeException("Expected at least 1 validity date to be present.")) - .asTable(name(VALIDITY_DATES_TABLE)); - } - - private SelectConditionStep createValidityDateQuery( - final ColumnDateRange columnDateRange, - final Connector connector, - final Optional childCondition - ) { - return dslContext.select(columnDateRange.toFields()) - .from(table(name(connector.getResolvedTable().getName()))) - .where(toJooqCondition(connector, childCondition)); - } - - private static org.jooq.Table union( - final Collection input, - final Function> mapper, - final BinaryOperator> operator, - final String 
tableName - ) { - return input.stream() - .map(mapper) - .reduce(operator) - .orElseThrow(() -> new IllegalStateException("Expected at least one element to union")) - .asTable(name(tableName)); - } - - private Condition toJooqCondition(final Connector connector, final Optional childCondition) { - final CTConditionContext context = CTConditionContext.create(connector, functionProvider); - return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) - .map(condition -> condition.convertToSqlCondition(context).condition()) - .orElse(noCondition()); - } - } - - @RequiredArgsConstructor - private class AllEqualOrPrefixConditionsTask implements Callable { + private class MatchingStatsTask implements Callable { private final TreeConcept treeConcept; @Override public Void call() { + final Map>> relevantColumns = collectRelevantColumns(treeConcept); final Map> validityDateMap = createColumnDateRanges(treeConcept); + + // union of all connectors of the concept final Select unioned = treeConcept.getConnectors().stream() - .map(connector -> this.createConnectorQuery(connector, validityDateMap)) + .map(connector -> this.createConnectorQuery(connector, relevantColumns, validityDateMap)) .reduce(Select::unionAll) .orElseThrow(IllegalStateException::new); + // select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); - final Select query = dslContext.select( - 
field(CONNECTOR_COLUMN), + // all connectors need the same columns originating from the concept definition - they might have different names in the respective connector tables, + // but as we aliased them already, we can just use the unified aliases in the final query + final List> relevantColumnsAliased = relevantColumns.get(treeConcept.getConnectors().get(0)).stream() + .map(field -> field(field.getUnqualifiedName())) + .collect(Collectors.toList()); + + final Select query = dslContext.select(relevantColumnsAliased) + .select( count(asterisk()).as(EVENTS), countDistinct(field(ENTITIES)).as(ENTITIES), validityDateExpression ) .from(unioned) - .groupBy(field(CONNECTOR_COLUMN)); - - final Map groupValueToStats = executionService.fetchStream(query).collect(Collectors.toMap( - record -> record.get(CONNECTOR_COLUMN, String.class), - record -> { - final long events = record.get(EVENTS, Integer.class).longValue(); - final long entities = record.get(ENTITIES, Integer.class).longValue(); - final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); - return new SqlMatchingStats(events, entities, dateSpan); + .groupBy(relevantColumnsAliased); + + final ConceptTreeCache treeCache = new ConceptTreeCache(treeConcept); + executionService.fetchStream(query).forEach(record -> mapRecordToConceptElements(record, relevantColumnsAliased, treeCache)); + return null; + } + + /** + * @return A map from a connector to all relevant columns the connector's concept defines. A relevant column is any column that is used by a {@link CTCondition} + * which is part of any child of a concept, or it's a concept's connector column. 
+ */ + private Map>> collectRelevantColumns(final TreeConcept treeConcept) { + return treeConcept.getConnectors().stream().collect(Collectors.toMap( + Function.identity(), + connector -> { + try { + return collectRelevantColumns(connector, treeConcept.getChildren()) + .stream() + .map(column -> { + final Field field = field(name(column)); + if (connector.getColumn() != null && connector.getColumn().resolve().getName().equals(column)) { + return field.as(CONNECTOR_COLUMN); + } + // a condition which does not operate on the connector column MUST have the same name in all connector's tables + return field; + }) + .collect(Collectors.toSet()); + } + catch (final ConfigurationException e) { + log.error("Could not calculate matching stats for Concept {} because of a configuration error", treeConcept.getName()); + throw new RuntimeException(e); + } } )); + } - treeConcept.setMatchingStats(groupValueToStats.values().stream().reduce(SqlMatchingStats::add).orElseGet(SqlMatchingStats::empty)); - setAndAggregate(groupValueToStats, treeConcept.getChildren()); + private Set collectRelevantColumns(final Connector connector, final List children) throws ConfigurationException { + final Set relevantColumns = new HashSet<>(); - return null; + for (ConceptTreeChild child : children) { + if (child.getCondition() == null && child.getChildren().isEmpty()) { + continue; + } + + final Set childColumns = new HashSet<>(); + + // Recursively collect columns from the current child's children, if they exist + if (!child.getChildren().isEmpty()) { + final Set childrenColumns = collectRelevantColumns(connector, child.getChildren()); + childColumns.addAll(childrenColumns); + } + + // Add columns from the child's condition, if it exists + if (child.getCondition() != null) { + final Set conditionColumns = child.getCondition().getColumns(connector); + childColumns.addAll(conditionColumns); + } + + relevantColumns.addAll(childColumns); + } + + return relevantColumns; } private Map> 
createColumnDateRanges(final TreeConcept treeConcept) { @@ -426,86 +249,98 @@ private List createColumnDateRanges(final Connector connector, .toList(); } - private Select createConnectorQuery(final ConceptTreeConnector connector, final Map> validityDateMap) { - + private Select createConnectorQuery( + final ConceptTreeConnector connector, + final Map>> relevantColumns, + final Map> validityDateMap + ) { final Table connectorTable = table(name(connector.getResolvedTable().getName())); - final Field connectorColumn = field(name(connectorTable.getName(), connector.getColumn().resolve().getName())).as(CONNECTOR_COLUMN); + final Set> connectorColumns = relevantColumns.get(connector); final Field primaryKey = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); // we have to select all possible validity dates of all connectors because we have to union multiple connectors - final Stream> validityDates = - validityDateMap.entrySet().stream().flatMap(entry -> entry.getValue().stream() - .map(columnDateRange -> entry.getKey() == connector + final List> validityDates = + validityDateMap.entrySet().stream() + .flatMap(entry -> entry.getValue().stream().map(columnDateRange -> entry.getKey() == connector ? 
columnDateRange : functionProvider.nulled(columnDateRange)) - .flatMap(columnDateRange -> columnDateRange.toFields().stream())); + .flatMap(columnDateRange -> columnDateRange.toFields().stream())) + .toList(); - return dslContext.select(Stream.concat(Stream.of(connectorColumn, primaryKey), validityDates).toList()) - .from(connectorTable); + // connector might have a condition + final Condition connectorCondition = toJooqCondition(connector, Optional.ofNullable(connector.getCondition())); + + return dslContext.select(primaryKey) + .select(connectorColumns) + .select(validityDates) + .from(connectorTable) + .where(connectorCondition); } - private void setAndAggregate(final Map groupValueToStats, final List children) { - children.stream().parallel().forEach(child -> { - final SqlMatchingStats nodeStats = new SqlMatchingStats(); - // node is leaf - if (child.getChildren().isEmpty()) { - collectByCondition(groupValueToStats, child, nodeStats); - } - else { - // although node is not a leaf, it can have a condition - if (child.getCondition() != null) { - collectByCondition(groupValueToStats, child, nodeStats); + private Condition toJooqCondition(final Connector connector, final Optional childCondition) { + final CTConditionContext context = CTConditionContext.create(connector, functionProvider); + return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) + .map(condition -> condition.convertToSqlCondition(context).condition()) + .orElse(noCondition()); + } + + private void mapRecordToConceptElements(final Record record, final List> relevantColumns, final ConceptTreeCache treeCache) { + + final CalculatedValue> rowMap = new CalculatedValue<>(record::intoMap); + final MatchingStats.Entry entry = toMatchingStatsEntry(record); + + if (treeConcept.getChildren().isEmpty()) { + addEntryToConceptElement(treeConcept, treeConcept.getName(), entry); + return; + } + + relevantColumns.stream().map(field -> record.get(field, 
String.class)).forEach(relevantColumnValue -> { + try { + final ConceptTreeChild mostSpecificChild = treeCache.findMostSpecificChild(relevantColumnValue, rowMap); + + // database value did not match any node of the concept + if (mostSpecificChild == null) { + return; + } + + // add stats for most specific child + addEntryToConceptElement(mostSpecificChild, relevantColumnValue, entry); + + // add child stats to all parents till concept root + ConceptTreeNode current = mostSpecificChild.getParent(); + while (current != null) { + addEntryToConceptElement(current, relevantColumnValue, entry); + current = current.getParent(); } - // recursively collect matching stats of children - setAndAggregate(groupValueToStats, child.getChildren()); } - child.setMatchingStats(nodeStats); + catch (ConceptConfigurationException e) { + throw new RuntimeException(e); + } }); } - private static void collectByCondition(final Map groupValueToStats, final ConceptTreeChild node, final SqlMatchingStats nodeStats) { + private MatchingStats.Entry toMatchingStatsEntry(Record record) { + final long events = record.get(EVENTS, Integer.class).longValue(); + final long entities = record.get(ENTITIES, Integer.class).longValue(); + final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); + return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), dateSpan.getMaxValue()); + } - // TODO make those methods of CTCondition and call directly on node.condition() - if (node.getCondition() instanceof EqualCondition equalCondition) { - equalCondition.getValues().forEach(val -> { - final SqlMatchingStats statsForCondition = groupValueToStats.getOrDefault(val, SqlMatchingStats.empty()); - nodeStats.add(statsForCondition); - }); - return; - } - else if (node.getCondition() instanceof PrefixCondition prefixCondition) { - Arrays.stream(prefixCondition.getPrefixes()).forEach(prefix -> { - final SqlMatchingStats statsForCondition = groupValueToStats.entrySet().stream() - .filter(entry 
-> entry.getKey().startsWith(prefix)) - .map(Map.Entry::getValue) - .reduce(SqlMatchingStats::add) - .orElseGet(SqlMatchingStats::empty); - nodeStats.add(statsForCondition); - }); - return; + private CDateRange toDateRange(final String validityDateExpression) { + final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); + return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : CDateRange.all(); + } + + private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { + final MatchingStats childMatchingStats; + if (mostSpecificChild.getMatchingStats() == null) { + childMatchingStats = new MatchingStats(); + ((ConceptElement) mostSpecificChild).setMatchingStats(childMatchingStats); } - else if (node.getCondition() instanceof PrefixRangeCondition prefixRangeCondition) { - final SqlMatchingStats statsForCondition = groupValueToStats.entrySet().stream() - .filter(entry -> { - - final String groupValue = entry.getKey(); - final String min = prefixRangeCondition.getMin(); - final String max = prefixRangeCondition.getMax(); - - if (groupValue.length() < min.length()) { - return false; - } - - String pref = groupValue.substring(0, min.length()); - return min.compareTo(pref) <= 0 && max.compareTo(pref) >= 0; - }) - .map(Map.Entry::getValue) - .reduce(SqlMatchingStats::add) - .orElseGet(SqlMatchingStats::empty); - nodeStats.add(statsForCondition); - return; + else { + childMatchingStats = mostSpecificChild.getMatchingStats(); } - throw new IllegalArgumentException("Unsupported condition type: " + node.getCondition().getClass().getSimpleName()); + childMatchingStats.putEntry(columnKey, entry); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java index d3590f389b..50e8adcdc4 100644 --- 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java @@ -1,16 +1,126 @@ package com.bakdata.conquery.models.datasets.concepts; -import javax.annotation.Nullable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import com.bakdata.conquery.models.common.daterange.CDateRange; +import com.bakdata.conquery.models.datasets.Column; +import com.bakdata.conquery.models.datasets.Table; +import com.bakdata.conquery.models.events.Bucket; +import com.fasterxml.jackson.annotation.JsonIgnore; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; -public interface MatchingStats { +public class MatchingStats { - long countEvents(); + private final Map entries = new HashMap<>(); - long countEntities(); + @JsonIgnore + private transient CDateRange span; - @Nullable - CDateRange spanEvents(); + @JsonIgnore + private transient long numberOfEvents = -1L; + + @JsonIgnore + private transient long numberOfEntities = -1L; + + public long countEvents() { + if (numberOfEvents == -1L) { + synchronized (this) { + if (numberOfEvents == -1L) { + numberOfEvents = entries.values().stream().mapToLong(MatchingStats.Entry::getNumberOfEvents).sum(); + } + } + } + return numberOfEvents; + } + + + public long countEntities() { + if (numberOfEntities == -1L) { + synchronized (this) { + if (numberOfEntities == -1L) { + numberOfEntities = entries.values().stream().mapToLong(MatchingStats.Entry::getNumberOfEntities).sum(); + } + } + } + return numberOfEntities; + } + + public CDateRange spanEvents() { + if (span == null) { + synchronized (this) { + if (span == null) { + span = entries.values().stream().map(MatchingStats.Entry::getSpan).reduce(CDateRange.all(), CDateRange::spanClosed); + } + } + } + return span; + + } + + public void putEntry(String source, MatchingStats.Entry entry) { + 
synchronized (this) { + entries.put(source, entry); + span = null; + numberOfEntities = -1L; + numberOfEvents = -1L; + } + } + + @Data + @NoArgsConstructor + @AllArgsConstructor + public static class Entry { + private long numberOfEvents; + + @JsonIgnore + private final Set foundEntities = new HashSet<>(); + private long numberOfEntities; + private int minDate = Integer.MAX_VALUE; + private int maxDate = Integer.MIN_VALUE; + + @JsonIgnore + public CDateRange getSpan() { + if (minDate == Integer.MAX_VALUE && maxDate == Integer.MIN_VALUE) { + return null; + } + + return CDateRange.of( + minDate == Integer.MAX_VALUE ? Integer.MIN_VALUE : minDate, + maxDate == Integer.MIN_VALUE ? Integer.MAX_VALUE : maxDate + ); + } + + public void addEvent(Table table, Bucket bucket, int event, String entityForEvent) { + numberOfEvents++; + if (foundEntities.add(entityForEvent)) { + numberOfEntities++; + } + + for (Column c : table.getColumns()) { + if (!c.getType().isDateCompatible()) { + continue; + } + + if (!bucket.has(event, c)) { + continue; + } + + final CDateRange time = bucket.getAsDateRange(event, c); + + if (time.hasUpperBound()) { + maxDate = Math.max(time.getMaxValue(), maxDate); + } + + if (time.hasLowerBound()) { + minDate = Math.min(time.getMinValue(), minDate); + } + } + } + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java index 86c7dbc772..d846aae539 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java @@ -1,16 +1,20 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import jakarta.validation.Valid; +import 
jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.Valid; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; @@ -52,4 +56,13 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { () -> new IllegalStateException("At least one condition is required to convert %s to a SQL condition.".formatted(getClass())) ); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + final Set columns = new HashSet<>(); + for (CTCondition ctCondition : conditions) { + columns.addAll(ctCondition.getColumns(connector)); + } + return columns; + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java index cdcc880ad2..ef2d9ef473 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java @@ -1,10 +1,13 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; import java.util.Map; +import java.util.Set; import com.bakdata.conquery.io.cps.CPSBase; +import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import 
com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; @@ -13,14 +16,17 @@ /** * A general condition that serves as a guard for concept tree nodes. */ -@JsonTypeInfo(use=JsonTypeInfo.Id.CUSTOM, property="type") +@JsonTypeInfo(use = JsonTypeInfo.Id.CUSTOM, property = "type") @CPSBase public interface CTCondition { - default void init(ConceptTreeNode node) throws ConceptConfigurationException {} - + default void init(ConceptTreeNode node) throws ConceptConfigurationException { + } + boolean matches(String value, CalculatedValue> rowMap) throws ConceptConfigurationException; WhereCondition convertToSqlCondition(CTConditionContext context); + Set getColumns(final Connector connector) throws ConfigurationException; + } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java index 1b98784e95..1572b40294 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java @@ -2,15 +2,17 @@ import java.util.Map; import java.util.Set; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import 
com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.CollectionsUtil; import com.fasterxml.jackson.annotation.JsonCreator; -import jakarta.validation.constraints.NotEmpty; import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Getter; @@ -21,13 +23,17 @@ /** * This condition requires the value of another column to be equal to a given value. */ -@CPSType(id="COLUMN_EQUAL", base=CTCondition.class) +@CPSType(id = "COLUMN_EQUAL", base = CTCondition.class) @AllArgsConstructor(access = AccessLevel.PRIVATE) public class ColumnEqualCondition implements CTCondition { - @Setter @Getter @NotEmpty + @Setter + @Getter + @NotEmpty private Set values; - @NotEmpty @Setter @Getter + @NotEmpty + @Setter + @Getter private String column; @JsonCreator(mode = JsonCreator.Mode.PROPERTIES) @@ -38,7 +44,7 @@ public static ColumnEqualCondition create(Set values, String column) { @Override public boolean matches(String value, CalculatedValue> rowMap) { Object checkedValue = rowMap.getValue().get(column); - if(checkedValue == null) { + if (checkedValue == null) { return false; } return values.contains(checkedValue.toString()); @@ -49,4 +55,9 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { Field field = DSL.field(DSL.name(context.getConnectorTable().getName(), column), String.class); return new MultiSelectCondition(field, values.toArray(String[]::new), context.getFunctionProvider()); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + return Set.of(column); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java index 80e3e104a6..10b1a4b55c 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java 
+++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java @@ -2,10 +2,12 @@ import java.util.Map; import java.util.Set; - import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.exceptions.ConfigurationException; +import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; @@ -21,11 +23,13 @@ /** * This condition requires each value to be exactly as given in the list. */ -@CPSType(id="EQUAL", base=CTCondition.class) +@CPSType(id = "EQUAL", base = CTCondition.class) @AllArgsConstructor public class EqualCondition implements CTCondition { - @Setter @Getter @NotEmpty + @Setter + @Getter + @NotEmpty private Set values; @JsonCreator(mode = JsonCreator.Mode.PROPERTIES) @@ -43,4 +47,13 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { Field field = DSL.field(DSL.name(context.getConnectorTable().getName(), context.getConnectorColumn().getName()), String.class); return new MultiSelectCondition(field, values.toArray(String[]::new), context.getFunctionProvider()); } + + @Override + public Set getColumns(final Connector connector) throws ConfigurationException { + final ColumnId column = connector.getColumn(); + if (column == null) { + throw new ConfigurationException("An EQUAL condition requires a connector column"); + } + return Set.of(column.getColumn()); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java index 6dc5829a95..f216d822ac 100644 --- 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java @@ -2,14 +2,16 @@ import java.time.LocalDate; import java.util.Map; +import java.util.Set; import java.util.stream.Stream; - import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.common.Range; +import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; @@ -81,6 +83,10 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { throw new UnsupportedOperationException("SQL conversion of CTCondition %s not supported yet.".formatted(getClass())); } + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + return Set.of(); + } public abstract static class ConditionScript extends Script { diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java index 0db1083b1f..0826df77a9 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java @@ -1,8 +1,11 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; import java.util.Map; +import java.util.Set; import com.bakdata.conquery.io.cps.CPSType; 
+import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; @@ -17,10 +20,11 @@ /** * This condition requires that the selected Column has a value. */ -@CPSType(id="PRESENT", base=CTCondition.class) +@CPSType(id = "PRESENT", base = CTCondition.class) public class IsPresentCondition implements CTCondition { - @Getter @Setter + @Getter + @Setter @NonNull private String column; @@ -34,4 +38,9 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { Condition condition = DSL.field(DSL.name(context.getConnectorTable().getName(), column)).isNotNull(); return new WhereConditionWrapper(condition, ConditionType.PREPROCESSING); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + return Set.of(column); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java index ac628ab371..6e672d29dc 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java @@ -1,12 +1,14 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; import java.util.Map; - +import java.util.Set; import jakarta.validation.Valid; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import 
com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; @@ -16,10 +18,12 @@ /** * This condition matches if its child does not. */ -@CPSType(id="NOT", base=CTCondition.class) +@CPSType(id = "NOT", base = CTCondition.class) public class NotCondition implements CTCondition { - @Setter @Getter @Valid + @Setter + @Getter + @Valid private CTCondition condition; @Override @@ -37,4 +41,9 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { WhereCondition whereCondition = condition.convertToSqlCondition(context); return whereCondition.negate(); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + return condition.getColumns(connector); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java index 4773fdaf73..a7237f972e 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java @@ -1,14 +1,17 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; +import java.util.HashSet; import java.util.List; import java.util.Map; - +import java.util.Set; import jakarta.validation.Valid; import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; +import com.bakdata.conquery.models.exceptions.ConfigurationException; import 
com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; @@ -53,4 +56,13 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { () -> new IllegalStateException("At least one condition is required to convert %s to a SQL condition.".formatted(getClass())) ); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + final Set columns = new HashSet<>(); + for (CTCondition ctCondition : conditions) { + columns.addAll(ctCondition.getColumns(connector)); + } + return columns; + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java index 0aa21b8cdc..84f53fd8eb 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java @@ -2,15 +2,19 @@ import java.util.Arrays; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.exceptions.ConfigurationException; +import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereConditionWrapper; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import 
lombok.Setter; import lombok.ToString; @@ -47,4 +51,13 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { Condition condition = context.getFunctionProvider().likeRegex(field, pattern); return new WhereConditionWrapper(condition, ConditionType.PREPROCESSING); } + + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + final ColumnId column = connector.getColumn(); + if (column == null) { + throw new ConfigurationException("A PREFIX condition requires a connector column"); + } + return Set.of(column.getColumn()); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java index 48002cc68d..732ef7f239 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java @@ -1,9 +1,13 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; import java.util.Map; +import java.util.Set; import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.exceptions.ConfigurationException; +import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.dialect.SqlFunctionProvider; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; @@ -61,6 +65,15 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { return new WhereConditionWrapper(regexCondition, ConditionType.PREPROCESSING); } + @Override + public Set getColumns(Connector connector) throws ConfigurationException { + final ColumnId 
column = connector.getColumn(); + if (column == null) { + throw new ConfigurationException("A PREFIX_RANGE condition requires a connector column"); + } + return Set.of(column.getColumn()); + } + private String buildSqlRegexPattern(SqlFunctionProvider functionProvider) { StringBuilder builder = new StringBuilder(); char[] minChars = min.toCharArray(); diff --git a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java index 10ff3a2979..11546e447e 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateElementMatchingStats.java @@ -5,9 +5,9 @@ import java.util.Map.Entry; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.mode.cluster.WorkerMatchingStats; import com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.ConceptElement; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; @@ -30,14 +30,14 @@ public class UpdateElementMatchingStats extends NamespaceMessage { private final WorkerId source; @ToString.Exclude - private final Map, WorkerMatchingStats.Entry> values; + private final Map, MatchingStats.Entry> values; @Override public void react(DistributedNamespace context) throws Exception { // We collect the concepts outside the loop to update the storage afterward Map> conceptsToUpdate = new HashMap<>(); - for (Entry, WorkerMatchingStats.Entry> entry : values.entrySet()) { + for (Entry, MatchingStats.Entry> entry : 
values.entrySet()) { try { ConceptElementId element = entry.getKey(); ConceptId conceptId = element.findConcept(); @@ -48,16 +48,16 @@ public void react(DistributedNamespace context) throws Exception { final ConceptElement target = concept.findById(element); - final WorkerMatchingStats.Entry value = entry.getValue(); + final MatchingStats.Entry value = entry.getValue(); conceptsToUpdate.put(conceptId, concept); - WorkerMatchingStats matchingStats = (WorkerMatchingStats) target.getMatchingStats(); + MatchingStats matchingStats = target.getMatchingStats(); if (matchingStats == null) { - matchingStats = new WorkerMatchingStats(); + matchingStats = new MatchingStats(); target.setMatchingStats(matchingStats); } - matchingStats.putEntry(source, value); + matchingStats.putEntry(source.getWorker(), value); } catch (Exception e) { log.error("Failed to set matching stats for '{}' (enable TRACE for exception)", entry.getKey(), (Exception) (log.isTraceEnabled() ? e : null)); diff --git a/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java b/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java index 68e172e5ae..f8e9e4e576 100644 --- a/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java +++ b/backend/src/test/java/com/bakdata/conquery/models/datasets/concepts/tree/MatchingStatsTests.java @@ -2,9 +2,9 @@ import static org.assertj.core.api.Assertions.assertThat; -import com.bakdata.conquery.mode.cluster.WorkerMatchingStats; import com.bakdata.conquery.models.datasets.Column; import com.bakdata.conquery.models.datasets.Table; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; import com.bakdata.conquery.models.identifiable.ids.specific.DatasetId; import com.bakdata.conquery.models.identifiable.ids.specific.WorkerId; import org.junit.jupiter.api.Test; @@ -17,17 +17,17 @@ public class MatchingStatsTests { @Test public void 
entitiesCountTest() { - WorkerMatchingStats stats = new WorkerMatchingStats(); + MatchingStats stats = new MatchingStats(); assertThat(stats.countEntities()).isEqualTo(0); - stats.putEntry(workerId1, new WorkerMatchingStats.Entry(5, 5, 10, 20)); + stats.putEntry(workerId1.getWorker(), new MatchingStats.Entry(5, 5, 10, 20)); assertThat(stats.countEntities()).isEqualTo(5); - stats.putEntry(workerId1, new WorkerMatchingStats.Entry(5, 8, 10, 20)); + stats.putEntry(workerId1.getWorker(), new MatchingStats.Entry(5, 8, 10, 20)); assertThat(stats.countEntities()).isEqualTo(8); - stats.putEntry(workerId2, new WorkerMatchingStats.Entry(5, 2, 10, 20)); + stats.putEntry(workerId2.getWorker(), new MatchingStats.Entry(5, 2, 10, 20)); assertThat(stats.countEntities()).isEqualTo(10); @@ -35,7 +35,7 @@ public void entitiesCountTest() { @Test public void addEventTest() { - WorkerMatchingStats stats = new WorkerMatchingStats(); + MatchingStats stats = new MatchingStats(); Table table = new Table(); table.setColumns(new Column[0]); @@ -43,7 +43,7 @@ public void addEventTest() { assertThat(stats.countEntities()).isEqualTo(0); - WorkerMatchingStats.Entry entry1 = new WorkerMatchingStats.Entry(); + MatchingStats.Entry entry1 = new MatchingStats.Entry(); entry1.addEvent(table, null, 1, "1"); entry1.addEvent(table, null, 2, "1"); @@ -57,12 +57,12 @@ public void addEventTest() { entry1.addEvent(table, null, 8, "4"); - stats.putEntry(workerId1, entry1); + stats.putEntry(workerId1.getWorker(), entry1); assertThat(stats.countEvents()).isEqualTo(8); assertThat(stats.countEntities()).isEqualTo(4); - WorkerMatchingStats.Entry entry2 = new WorkerMatchingStats.Entry(); + MatchingStats.Entry entry2 = new MatchingStats.Entry(); entry2.addEvent(table, null, 1, "1"); entry2.addEvent(table, null, 2, "2"); @@ -79,7 +79,7 @@ public void addEventTest() { entry2.addEvent(table, null, 9, "9"); entry2.addEvent(table, null, 10, "10"); - stats.putEntry(workerId2, entry2); + stats.putEntry(workerId2.getWorker(), 
entry2); assertThat(stats.countEvents()).isEqualTo(18); assertThat(stats.countEntities()).isEqualTo(14); From 9bf3c177da0542d0f9e16cfa26bfa07daadd3676 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 30 Oct 2024 15:53:46 +0100 Subject: [PATCH 14/31] Add missing getter and setter --- .../conquery/models/datasets/concepts/MatchingStats.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java index 50e8adcdc4..fa84124462 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/MatchingStats.java @@ -12,8 +12,12 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.AllArgsConstructor; import lombok.Data; +import lombok.Getter; import lombok.NoArgsConstructor; +import lombok.Setter; +@Getter +@Setter public class MatchingStats { private final Map entries = new HashMap<>(); From 944aca7aeaadf45535ac36b1f62ce8d3afe3146e Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 31 Oct 2024 08:42:39 +0100 Subject: [PATCH 15/31] Cleanup --- .../mode/local/SqlUpdateMatchingStatsJob.java | 396 +++++++++--------- .../concepts/conditions/AndCondition.java | 7 +- .../concepts/conditions/CTCondition.java | 3 +- .../conditions/ColumnEqualCondition.java | 5 +- .../concepts/conditions/EqualCondition.java | 12 +- .../concepts/conditions/GroovyCondition.java | 5 +- .../conditions/IsPresentCondition.java | 3 +- .../concepts/conditions/NotCondition.java | 5 +- .../concepts/conditions/OrCondition.java | 7 +- .../concepts/conditions/PrefixCondition.java | 12 +- .../conditions/PrefixRangeCondition.java | 12 +- .../models/worker/LocalNamespace.java | 2 +- .../conquery/models/worker/Namespace.java | 2 +- .../execution/DefaultResultSetProcessor.java | 7 +- 
.../tests/MetadataCollectionTest.java | 22 +- .../src/test/resources/shared/icd.test.json | 210 ---------- 16 files changed, 236 insertions(+), 474 deletions(-) delete mode 100644 backend/src/test/resources/shared/icd.test.json diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java index 3b532ee164..13ec43196e 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java @@ -1,6 +1,14 @@ package com.bakdata.conquery.mode.local; -import static org.jooq.impl.DSL.*; +import static org.jooq.impl.DSL.asterisk; +import static org.jooq.impl.DSL.count; +import static org.jooq.impl.DSL.countDistinct; +import static org.jooq.impl.DSL.field; +import static org.jooq.impl.DSL.max; +import static org.jooq.impl.DSL.min; +import static org.jooq.impl.DSL.name; +import static org.jooq.impl.DSL.noCondition; +import static org.jooq.impl.DSL.table; import java.sql.Date; import java.util.HashSet; @@ -8,7 +16,6 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -30,7 +37,6 @@ import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.jobs.Job; import com.bakdata.conquery.sql.conversion.SharedAliases; @@ -40,8 +46,8 @@ import com.bakdata.conquery.sql.execution.SqlExecutionService; import 
com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.TablePrimaryColumnUtil; -import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.time.StopWatch; import org.jooq.Condition; import org.jooq.DSLContext; import org.jooq.Field; @@ -90,22 +96,24 @@ public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); - final long startTime = System.currentTimeMillis(); - final List> runningQueries = concepts.stream() - .map(ConceptId::resolve) - .filter(SqlUpdateMatchingStatsJob::isTreeConcept) - .map(TreeConcept.class::cast) - .map(MatchingStatsTask::new) - .map(executors::submit) - .toList(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + final List> runningQueries = concepts.stream() + .map(ConceptId::resolve) + .filter(SqlUpdateMatchingStatsJob::isTreeConcept) + .map(TreeConcept.class::cast) + .map(treeConcept -> (Runnable) () -> calculateMatchingStats(treeConcept)) + .map(executors::submit) + .collect(Collectors.toList()); executors.shutdown(); while (!executors.awaitTermination(1, TimeUnit.MINUTES)) { log.debug("Waiting for executors to set matching stats for all concepts..."); } - final long timeElapsed = System.currentTimeMillis() - startTime; - log.debug("DONE collecting matching stats. Elapsed time: {} ms.", timeElapsed); + stopWatch.stop(); + log.debug("DONE collecting matching stats. 
Elapsed time: {} ms.", stopWatch.getTime()); runningQueries.forEach(SqlUpdateMatchingStatsJob::checkForError); } @@ -124,7 +132,7 @@ private static boolean isTreeConcept(final Concept concept) { return true; } - private static void checkForError(final Future future) { + private static void checkForError(final Future future) { try { future.get(); } @@ -133,216 +141,204 @@ private static void checkForError(final Future future) { } } - @RequiredArgsConstructor - private class MatchingStatsTask implements Callable { - - private final TreeConcept treeConcept; - - @Override - public Void call() { - - final Map>> relevantColumns = collectRelevantColumns(treeConcept); - final Map> validityDateMap = createColumnDateRanges(treeConcept); - - // union of all connectors of the concept - final Select unioned = treeConcept.getConnectors().stream() - .map(connector -> this.createConnectorQuery(connector, relevantColumns, validityDateMap)) - .reduce(Select::unionAll) - .orElseThrow(IllegalStateException::new); - - // select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors - final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); - final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); - final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); - final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); - - // all connectors need the same columns originating from the concept definition - they might have different names in the respective connector tables, - // but as we aliased them already, we can just use the unified aliases in the final query - final List> relevantColumnsAliased = 
relevantColumns.get(treeConcept.getConnectors().get(0)).stream() - .map(field -> field(field.getUnqualifiedName())) - .collect(Collectors.toList()); - - final Select query = dslContext.select(relevantColumnsAliased) - .select( - count(asterisk()).as(EVENTS), - countDistinct(field(ENTITIES)).as(ENTITIES), - validityDateExpression - ) - .from(unioned) - .groupBy(relevantColumnsAliased); - - final ConceptTreeCache treeCache = new ConceptTreeCache(treeConcept); - executionService.fetchStream(query).forEach(record -> mapRecordToConceptElements(record, relevantColumnsAliased, treeCache)); - return null; - } - - /** - * @return A map from a connector to all relevant columns the connector's concept defines. A relevant column is any column that is used by a {@link CTCondition} - * which is part of any child of a concept, or it's a concept's connector column. - */ - private Map>> collectRelevantColumns(final TreeConcept treeConcept) { - return treeConcept.getConnectors().stream().collect(Collectors.toMap( - Function.identity(), - connector -> { - try { - return collectRelevantColumns(connector, treeConcept.getChildren()) - .stream() - .map(column -> { - final Field field = field(name(column)); - if (connector.getColumn() != null && connector.getColumn().resolve().getName().equals(column)) { - return field.as(CONNECTOR_COLUMN); - } - // a condition which does not operate on the connector column MUST have the same name in all connector's tables - return field; - }) - .collect(Collectors.toSet()); - } - catch (final ConfigurationException e) { - log.error("Could not calculate matching stats for Concept {} because of a configuration error", treeConcept.getName()); - throw new RuntimeException(e); - } - } - )); - } + public void calculateMatchingStats(final TreeConcept treeConcept) { + + final Map>> relevantColumns = collectRelevantColumns(treeConcept); + final Map> validityDateMap = createColumnDateRanges(treeConcept); + + // union of all connectors of the concept + final Select 
unioned = treeConcept.getConnectors().stream() + .map(connector -> this.createConnectorQuery(connector, relevantColumns, validityDateMap)) + .reduce(Select::unionAll) + .orElseThrow(IllegalStateException::new); + + // select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors + final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); + final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); + final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); + final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); + + // all connectors need the same columns originating from the concept definition - they might have different names in the respective connector tables, + // but as we aliased them already, we can just use the unified aliases in the final query + final List> relevantColumnsAliased = relevantColumns.get(treeConcept.getConnectors().get(0)).stream() + .map(field -> field(field.getUnqualifiedName())) + .collect(Collectors.toList()); + + final Select query = dslContext.select(relevantColumnsAliased) + .select( + count(asterisk()).as(EVENTS), + countDistinct(field(ENTITIES)).as(ENTITIES), + validityDateExpression + ) + .from(unioned) + .groupBy(relevantColumnsAliased); + + final ConceptTreeCache treeCache = new ConceptTreeCache(treeConcept); + executionService.fetchStream(query).forEach(record -> mapRecordToConceptElements(treeConcept, record, relevantColumnsAliased, treeCache)); + } - private Set collectRelevantColumns(final Connector connector, final List children) throws ConfigurationException { - final Set relevantColumns = new HashSet<>(); + /** + * @return A map from a connector to all 
relevant columns the connector's concept defines. A relevant column is any column that is used by a + * {@link CTCondition} which is part of any child of a concept, or it's a concept's connector column. + */ + private Map>> collectRelevantColumns(final TreeConcept treeConcept) { + return treeConcept.getConnectors().stream().collect(Collectors.toMap( + Function.identity(), + connector -> collectRelevantColumns(connector, treeConcept.getChildren()) + .stream() + .map(column -> { + final Field field = field(name(column)); + // connector columns are unioned, thus they need the same alias + if (connector.getColumn() != null && connector.getColumn().resolve().getName().equals(column)) { + return field.as(CONNECTOR_COLUMN); + } + // a condition which does not operate on the connector column MUST have the same name in all connector's tables + return field; + }) + .collect(Collectors.toSet()) + )); + } - for (ConceptTreeChild child : children) { - if (child.getCondition() == null && child.getChildren().isEmpty()) { - continue; - } + private Set collectRelevantColumns(final Connector connector, final List children) { + final Set relevantColumns = new HashSet<>(); - final Set childColumns = new HashSet<>(); + for (ConceptTreeChild child : children) { + if (child.getCondition() == null && child.getChildren().isEmpty()) { + continue; + } - // Recursively collect columns from the current child's children, if they exist - if (!child.getChildren().isEmpty()) { - final Set childrenColumns = collectRelevantColumns(connector, child.getChildren()); - childColumns.addAll(childrenColumns); - } + final Set childColumns = new HashSet<>(); - // Add columns from the child's condition, if it exists - if (child.getCondition() != null) { - final Set conditionColumns = child.getCondition().getColumns(connector); - childColumns.addAll(conditionColumns); - } + // Recursively collect columns from the current child's children, if they exist + if (!child.getChildren().isEmpty()) { + final Set 
childrenColumns = collectRelevantColumns(connector, child.getChildren()); + childColumns.addAll(childrenColumns); + } - relevantColumns.addAll(childColumns); + // Add columns from the child's condition, if it exists + if (child.getCondition() != null) { + final Set conditionColumns = child.getCondition().getColumns(connector); + childColumns.addAll(conditionColumns); } - return relevantColumns; + relevantColumns.addAll(childColumns); } - private Map> createColumnDateRanges(final TreeConcept treeConcept) { - final AtomicInteger counter = new AtomicInteger(0); - return treeConcept.getConnectors().stream().collect(Collectors.toMap( - Function.identity(), - connector -> createColumnDateRanges(connector, counter) - )); - } + return relevantColumns; + } - private List createColumnDateRanges(final Connector connector, final AtomicInteger counter) { - return connector.getValidityDates().stream() - .map(functionProvider::forValidityDate) - .map(daterange -> daterange.as("%s-%d".formatted(SharedAliases.DATES_COLUMN.getAlias(), counter.incrementAndGet()))) - .toList(); - } + private Map> createColumnDateRanges(final TreeConcept treeConcept) { + final AtomicInteger counter = new AtomicInteger(0); + return treeConcept.getConnectors().stream().collect(Collectors.toMap( + Function.identity(), + connector -> createColumnDateRanges(connector, counter) + )); + } - private Select createConnectorQuery( - final ConceptTreeConnector connector, - final Map>> relevantColumns, - final Map> validityDateMap - ) { - final Table connectorTable = table(name(connector.getResolvedTable().getName())); - final Set> connectorColumns = relevantColumns.get(connector); - final Field primaryKey = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); - - // we have to select all possible validity dates of all connectors because we have to union multiple connectors - final List> validityDates = - validityDateMap.entrySet().stream() - .flatMap(entry -> 
entry.getValue().stream().map(columnDateRange -> entry.getKey() == connector - ? columnDateRange - : functionProvider.nulled(columnDateRange)) - .flatMap(columnDateRange -> columnDateRange.toFields().stream())) - .toList(); - - // connector might have a condition - final Condition connectorCondition = toJooqCondition(connector, Optional.ofNullable(connector.getCondition())); - - return dslContext.select(primaryKey) - .select(connectorColumns) - .select(validityDates) - .from(connectorTable) - .where(connectorCondition); - } + private List createColumnDateRanges(final Connector connector, final AtomicInteger counter) { + return connector.getValidityDates().stream() + .map(functionProvider::forValidityDate) + .map(daterange -> daterange.as("%s-%d".formatted(SharedAliases.DATES_COLUMN.getAlias(), counter.incrementAndGet()))) + .toList(); + } - private Condition toJooqCondition(final Connector connector, final Optional childCondition) { - final CTConditionContext context = CTConditionContext.create(connector, functionProvider); - return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) - .map(condition -> condition.convertToSqlCondition(context).condition()) - .orElse(noCondition()); - } + private Select createConnectorQuery( + final ConceptTreeConnector connector, + final Map>> relevantColumns, + final Map> validityDateMap + ) { + final Table connectorTable = table(name(connector.getResolvedTable().getName())); + final Set> connectorColumns = relevantColumns.get(connector); + final Field primaryKey = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); + + // we have to select all possible validity dates of all connectors because we have to union multiple connectors + final List> validityDates = + validityDateMap.entrySet().stream() + .flatMap(entry -> entry.getValue().stream().map(columnDateRange -> entry.getKey() == connector + ? 
columnDateRange + : functionProvider.nulled(columnDateRange)) + .flatMap(columnDateRange -> columnDateRange.toFields().stream())) + .toList(); + + // connector might have a condition + final Condition connectorCondition = toJooqCondition(connector, Optional.ofNullable(connector.getCondition())); + + return dslContext.select(primaryKey) + .select(connectorColumns) + .select(validityDates) + .from(connectorTable) + .where(connectorCondition); + } - private void mapRecordToConceptElements(final Record record, final List> relevantColumns, final ConceptTreeCache treeCache) { + private Condition toJooqCondition(final Connector connector, final Optional childCondition) { + final CTConditionContext context = CTConditionContext.create(connector, functionProvider); + return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) + .map(condition -> condition.convertToSqlCondition(context).condition()) + .orElse(noCondition()); + } - final CalculatedValue> rowMap = new CalculatedValue<>(record::intoMap); - final MatchingStats.Entry entry = toMatchingStatsEntry(record); + private void mapRecordToConceptElements( + final TreeConcept treeConcept, + final Record record, + final List> relevantColumns, + final ConceptTreeCache treeCache + ) { + final CalculatedValue> rowMap = new CalculatedValue<>(record::intoMap); + final MatchingStats.Entry entry = toMatchingStatsEntry(record); - if (treeConcept.getChildren().isEmpty()) { - addEntryToConceptElement(treeConcept, treeConcept.getName(), entry); - return; - } + if (treeConcept.getChildren().isEmpty()) { + addEntryToConceptElement(treeConcept, treeConcept.getName(), entry); + return; + } - relevantColumns.stream().map(field -> record.get(field, String.class)).forEach(relevantColumnValue -> { - try { - final ConceptTreeChild mostSpecificChild = treeCache.findMostSpecificChild(relevantColumnValue, rowMap); + relevantColumns.stream().map(field -> record.get(field, String.class)).forEach(relevantColumnValue -> { + try { + 
final ConceptTreeChild mostSpecificChild = treeCache.findMostSpecificChild(relevantColumnValue, rowMap); - // database value did not match any node of the concept - if (mostSpecificChild == null) { - return; - } + // database value did not match any node of the concept + if (mostSpecificChild == null) { + return; + } - // add stats for most specific child - addEntryToConceptElement(mostSpecificChild, relevantColumnValue, entry); + // add stats for most specific child + addEntryToConceptElement(mostSpecificChild, relevantColumnValue, entry); - // add child stats to all parents till concept root - ConceptTreeNode current = mostSpecificChild.getParent(); - while (current != null) { - addEntryToConceptElement(current, relevantColumnValue, entry); - current = current.getParent(); - } - } - catch (ConceptConfigurationException e) { - throw new RuntimeException(e); + // add child stats to all parents till concept root + ConceptTreeNode current = mostSpecificChild.getParent(); + while (current != null) { + addEntryToConceptElement(current, relevantColumnValue, entry); + current = current.getParent(); } - }); - } + } + catch (ConceptConfigurationException e) { + throw new RuntimeException(e); + } + }); + } - private MatchingStats.Entry toMatchingStatsEntry(Record record) { - final long events = record.get(EVENTS, Integer.class).longValue(); - final long entities = record.get(ENTITIES, Integer.class).longValue(); - final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); - return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), dateSpan.getMaxValue()); - } + private MatchingStats.Entry toMatchingStatsEntry(Record record) { + final long events = record.get(EVENTS, Integer.class).longValue(); + final long entities = record.get(ENTITIES, Integer.class).longValue(); + final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); + return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), dateSpan.getMaxValue()); + } - 
private CDateRange toDateRange(final String validityDateExpression) { - final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); - return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : CDateRange.all(); - } + private CDateRange toDateRange(final String validityDateExpression) { + final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); + return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : CDateRange.all(); + } - private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { - final MatchingStats childMatchingStats; - if (mostSpecificChild.getMatchingStats() == null) { - childMatchingStats = new MatchingStats(); - ((ConceptElement) mostSpecificChild).setMatchingStats(childMatchingStats); - } - else { - childMatchingStats = mostSpecificChild.getMatchingStats(); - } - childMatchingStats.putEntry(columnKey, entry); + private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { + final MatchingStats childMatchingStats; + if (mostSpecificChild.getMatchingStats() == null) { + childMatchingStats = new MatchingStats(); + ((ConceptElement) mostSpecificChild).setMatchingStats(childMatchingStats); } - + else { + childMatchingStats = mostSpecificChild.getMatchingStats(); + } + childMatchingStats.putEntry(columnKey, entry); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java index d846aae539..18d56960fe 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java +++ 
b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java @@ -4,17 +4,16 @@ import java.util.List; import java.util.Map; import java.util.Set; -import jakarta.validation.Valid; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; +import jakarta.validation.Valid; +import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; @@ -58,7 +57,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { + public Set getColumns(Connector connector) { final Set columns = new HashSet<>(); for (CTCondition ctCondition : conditions) { columns.addAll(ctCondition.getColumns(connector)); diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java index ef2d9ef473..2098e2995f 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java @@ -7,7 +7,6 @@ import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import 
com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; @@ -27,6 +26,6 @@ default void init(ConceptTreeNode node) throws ConceptConfigurationException { WhereCondition convertToSqlCondition(CTConditionContext context); - Set getColumns(final Connector connector) throws ConfigurationException; + Set getColumns(Connector connector); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java index 1572b40294..438dc1a046 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java @@ -2,17 +2,16 @@ import java.util.Map; import java.util.Set; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.CollectionsUtil; import com.fasterxml.jackson.annotation.JsonCreator; +import jakarta.validation.constraints.NotEmpty; import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Getter; @@ -57,7 +56,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws 
ConfigurationException { + public Set getColumns(Connector connector) { return Set.of(column); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java index 10b1a4b55c..3942231400 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java @@ -2,18 +2,16 @@ import java.util.Map; import java.util.Set; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.exceptions.ConfigurationException; -import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.CollectionsUtil; import com.fasterxml.jackson.annotation.JsonCreator; +import jakarta.validation.constraints.NotEmpty; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.Setter; @@ -49,11 +47,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(final Connector connector) throws ConfigurationException { - final ColumnId column = connector.getColumn(); - if (column == null) { - throw new ConfigurationException("An EQUAL condition requires a connector column"); - } - return Set.of(column.getColumn()); + public Set getColumns(Connector connector) { + return Set.of(connector.getColumn().getColumn()); } } diff --git 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java index f216d822ac..2d314eb032 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java @@ -4,20 +4,19 @@ import java.util.Map; import java.util.Set; import java.util.stream.Stream; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.common.Range; import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.fasterxml.jackson.annotation.JsonIgnore; import groovy.lang.GroovyShell; import groovy.lang.Script; +import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; @@ -84,7 +83,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { + public Set getColumns(Connector connector) { return Set.of(); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java index 0826df77a9..bef9540555 100644 --- 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java @@ -5,7 +5,6 @@ import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; @@ -40,7 +39,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { + public Set getColumns(Connector connector) { return Set.of(column); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java index 6e672d29dc..869be30283 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java @@ -2,16 +2,15 @@ import java.util.Map; import java.util.Set; -import jakarta.validation.Valid; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; +import 
jakarta.validation.Valid; import lombok.Getter; import lombok.Setter; @@ -43,7 +42,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { + public Set getColumns(Connector connector) { return condition.getColumns(connector); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java index a7237f972e..270193e2e2 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java @@ -4,17 +4,16 @@ import java.util.List; import java.util.Map; import java.util.Set; -import jakarta.validation.Valid; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; -import com.bakdata.conquery.models.exceptions.ConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; +import jakarta.validation.Valid; +import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; @@ -58,7 +57,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { + public Set getColumns(Connector connector) { final Set columns = new HashSet<>(); for (CTCondition ctCondition : conditions) { columns.addAll(ctCondition.getColumns(connector)); diff --git 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java index 84f53fd8eb..7ad362505f 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java @@ -4,17 +4,15 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.exceptions.ConfigurationException; -import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereConditionWrapper; import com.bakdata.conquery.util.CalculatedValue; +import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import lombok.ToString; @@ -53,11 +51,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { - final ColumnId column = connector.getColumn(); - if (column == null) { - throw new ConfigurationException("A PREFIX condition requires a connector column"); - } - return Set.of(column.getColumn()); + public Set getColumns(Connector connector) { + return Set.of(connector.getColumn().getColumn()); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java 
b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java index 732ef7f239..75143b6436 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java @@ -2,12 +2,9 @@ import java.util.Map; import java.util.Set; -import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.exceptions.ConfigurationException; -import com.bakdata.conquery.models.identifiable.ids.specific.ColumnId; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.dialect.SqlFunctionProvider; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; @@ -16,6 +13,7 @@ import com.bakdata.conquery.util.CalculatedValue; import com.fasterxml.jackson.annotation.JsonIgnore; import io.dropwizard.validation.ValidationMethod; +import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import org.jooq.Condition; @@ -66,12 +64,8 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) throws ConfigurationException { - final ColumnId column = connector.getColumn(); - if (column == null) { - throw new ConfigurationException("A PREFIX_RANGE condition requires a connector column"); - } - return Set.of(column.getColumn()); + public Set getColumns(Connector connector) { + return Set.of(connector.getColumn().getColumn()); } private String buildSqlRegexPattern(SqlFunctionProvider functionProvider) { diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index 
3dbdf83adb..383fe5f4a4 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -65,7 +65,7 @@ public LocalNamespace( @Override void updateMatchingStats() { - final Set concepts = collectConcepts(); + final Set concepts = collectConceptsWithoutMatchingStats(); ExecutorService executorService = Executors.newFixedThreadPool(sqlConnectorConfig.getBackgroundThreads()); Job job = new SqlUpdateMatchingStatsJob( databaseConfig, diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java index 7f112044de..9821f5303f 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java @@ -142,7 +142,7 @@ public void postprocessData() { } - protected Set collectConcepts() { + protected Set collectConceptsWithoutMatchingStats() { return getStorage().getAllConcepts() .filter(concept -> concept.getMatchingStats() == null) .map(Concept::getId) diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java index b65541af85..531bd7fbe5 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java @@ -11,13 +11,14 @@ import com.bakdata.conquery.models.config.ConqueryConfig; import com.bakdata.conquery.util.DateReader; +import lombok.Getter; import lombok.RequiredArgsConstructor; @RequiredArgsConstructor class DefaultResultSetProcessor implements ResultSetProcessor { private final ConqueryConfig config; - @lombok.Getter + @Getter private final SqlCDateSetParser cDateSetParser; @Override @@ -120,7 +121,7 @@ private List 
fromString(ResultSet resultSet, int columnIndex, Function * For example, calling a primitives' ResultSet getter like getDouble, getInt etc. straightaway will never return null. */ - private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, Getter getter, Class resultType) throws SQLException { + private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, GetMethod getter, Class resultType) throws SQLException { if (resultSet.getObject(columnIndex) == null) { return null; diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java index c6573cb181..353fd2d31c 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/MetadataCollectionTest.java @@ -3,17 +3,15 @@ import static org.assertj.core.api.Assertions.assertThat; import java.time.LocalDate; +import java.util.Set; import com.bakdata.conquery.integration.IntegrationTest; import com.bakdata.conquery.integration.json.ConqueryTestSpec; import com.bakdata.conquery.integration.json.JsonIntegrationTest; import com.bakdata.conquery.models.common.daterange.CDateRange; -import com.bakdata.conquery.models.datasets.concepts.Concept; import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; import com.bakdata.conquery.models.exceptions.ValidatorHelper; import com.bakdata.conquery.models.identifiable.ids.specific.DatasetId; -import com.bakdata.conquery.models.messages.namespaces.specific.UpdateMatchingStatsMessage; -import com.bakdata.conquery.models.worker.DistributedNamespace; import com.bakdata.conquery.util.support.StandaloneSupport; import com.github.powerlibraries.io.In; import lombok.extern.slf4j.Slf4j; @@ -21,6 +19,11 @@ @Slf4j public class MetadataCollectionTest extends IntegrationTest.Simple implements ProgrammaticIntegrationTest { 
+ @Override + public Set forModes() { + return Set.of(StandaloneSupport.Mode.SQL, StandaloneSupport.Mode.WORKER); + } + @Override public void execute(StandaloneSupport conquery) throws Exception { //read test sepcification @@ -34,10 +37,7 @@ public void execute(StandaloneSupport conquery) throws Exception { test.importRequiredData(conquery); //ensure the metadata is collected - DistributedNamespace namespace = (DistributedNamespace) conquery.getNamespace(); - namespace.getWorkerHandler() - .sendToAll(new UpdateMatchingStatsMessage(conquery.getNamespace().getStorage().getAllConcepts().map(Concept::getId).toList())); - + conquery.getNamespace().postprocessData(); conquery.waitUntilWorkDone(); TreeConcept concept = (TreeConcept) conquery.getNamespace().getStorage().getAllConcepts().iterator().next(); @@ -47,13 +47,13 @@ public void execute(StandaloneSupport conquery) throws Exception { assertThat(concept.getChildren()).allSatisfy(c -> { assertThat(c.getMatchingStats().countEvents()).isEqualTo(2); }); - + //check the date ranges assertThat(concept.getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2013-11-10"))); + .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2013-11-10"))); assertThat(concept.getChildren().get(0).getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2012-01-01"), LocalDate.parse("2013-11-10"))); + .isEqualTo(CDateRange.of(LocalDate.parse("2012-01-01"), LocalDate.parse("2013-11-10"))); assertThat(concept.getChildren().get(1).getMatchingStats().spanEvents()) - .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2012-11-11"))); + .isEqualTo(CDateRange.of(LocalDate.parse("2010-07-15"), LocalDate.parse("2012-11-11"))); } } diff --git a/backend/src/test/resources/shared/icd.test.json b/backend/src/test/resources/shared/icd.test.json deleted file mode 100644 index e6b97db334..0000000000 --- 
a/backend/src/test/resources/shared/icd.test.json +++ /dev/null @@ -1,210 +0,0 @@ -{ - "type": "QUERY_TEST", - "label": "COMMON_CONCEPT_ICD_QUERY Test", - "expectedCsv": "", - "query": { - "type": "CONCEPT_QUERY", - "root": { - "type": "AND", - "children": [ - { - "type": "DATE_RESTRICTION", - "dateRange": { - "min": "2017-01-01", - "max": "2017-12-31" - }, - "child": { - "type": "CONCEPT", - "ids": [ - "icd.f00$2df99.f20$2df29.f20" - ], - "label": "F20", - "tables": [ - { - "id": "icd.kh_diagnose_icd_code", - "filters": [] - } - ] - } - } - ] - } - }, - "concepts": [ - { - "label": "ICD", - "type": "TREE", - "additionalInfos": [ - { - "key": "ICD-Codes", - "value": "Historisierung bis einschließlich des Jahres 2018" - } - ], - "connectors": [ - { - "label": "KH-Diagnose", - "name": "kh_diagnose_icd_code", - "column": "kh_diagnose.icd_code", - "condition": { - "type": "PREFIX_RANGE", - "min": "E", - "max": "F" - }, - "validityDates": [ - { - "label": "Aufnahmedatum", - "column": "kh_diagnose.aufnahmedatum" - }, - { - "label": "Entlassungsdatum", - "column": "kh_diagnose.entlassungsdatum" - } - ], - "filters": [] - } - ], - "children": [ - { - "label": "F00-F99", - "description": "Psychische und Verhaltensstörungen", - "condition": { - "type": "PREFIX_RANGE", - "min": "F00", - "max": "F99" - }, - "children": [ - { - "label": "F20-F29", - "description": "Schizophrenie, schizotype und wahnhafte Störungen", - "condition": { - "type": "PREFIX_RANGE", - "min": "F20", - "max": "F29" - }, - "children": [ - { - "label": "F20", - "description": "Schizophrenie", - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F20" - ] - }, - "children": [ - { - "label": "F20.0", - "description": "Paranoide Schizophrenie", - "additionalInfos": [ - { - "key": "Stichworte", - "value": "Paranoide Schizophrenie -- Paranoid-halluzinatorische Schizophrenie -- Paranoide Schizophrenie mit Halluzination -- Paraphrenie -- Paranoid-schizophrene Psychose -- Akute Paraphrenie -- Paraphrene 
Schizophrenie -- Akute paranoide Schizophrenie" - } - ], - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F200" - ] - } - }, - { - "label": "F20.1", - "description": "Hebephrene Schizophrenie", - "additionalInfos": [ - { - "key": "Stichworte", - "value": "Hebephrenie -- Hebephrene Schizophrenie -- Akute Hebephrenie -- Hebephrene Demenz -- Hebephrene Dementia praecox -- Desintegrative Schizophrenie -- Desorganisierte Schizophrenie -- Jugendirresein" - } - ], - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F201" - ] - } - }, - { - "label": "F20.4", - "description": "Postschizophrene Depression", - "additionalInfos": [ - { - "key": "Stichworte", - "value": "Postschizophrene Depression" - } - ], - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F204" - ] - } - }, - { - "label": "F20.5", - "description": "Schizophrenes Residuum", - "additionalInfos": [ - { - "key": "Stichworte", - "value": "Schizophrenes Residuum -- Schizophrener Restzustand -- Chronischer Morbus Bleuler -- Schizophrener Defekt -- Chronische Schizophrenie a.n.k. 
-- Residuale Schizophrenie -- Schizophrener Residualzustand -- Chronische undifferenzierte Schizophrenie" - } - ], - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F205" - ] - } - }, - { - "label": "F20.6", - "description": "Schizophrenia simplex", - "additionalInfos": [ - { - "key": "Stichworte", - "value": "Schizophrenia simplex -- Akute primäre Schizophrenie -- Akute einfache Schizophrenie" - } - ], - "condition": { - "type": "PREFIX_LIST", - "prefixes": [ - "F206" - ] - } - } - ] - } - ] - } - ] - } - ] - } - ], - "content": { - "tables": [ - { - "csv": "/shared/kh-content.csv", - "name": "kh_diagnose", - "primaryColumn": { - "name": "primary_column", - "type": "STRING" - }, - "columns": [ - { - "name": "icd_code", - "type": "STRING" - }, - { - "name": "aufnahmedatum", - "type": "DATE" - }, - { - "name": "entlassungsdatum", - "type": "DATE" - } - ] - } - ] - } -} From 7e60026e125c2f6f158e4f58f0bd5b91108f7cdf Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 31 Oct 2024 08:55:09 +0100 Subject: [PATCH 16/31] Remove kh-content.csv --- backend/src/test/resources/shared/kh-content.csv | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 backend/src/test/resources/shared/kh-content.csv diff --git a/backend/src/test/resources/shared/kh-content.csv b/backend/src/test/resources/shared/kh-content.csv deleted file mode 100644 index 679471fb56..0000000000 --- a/backend/src/test/resources/shared/kh-content.csv +++ /dev/null @@ -1,11 +0,0 @@ -primary_column,icd_code,aufnahmedatum,entlassungsdatum -3,"F200",2022-11-28,2022-11-11 -3,"F201",2021-08-31,2021-12-15 -3,"F204",2010-07-01,2019-07-13 -3,"F205",2023-02-06,2023-02-18 -3,"F206",2021-10-22,2021-11-06 -10,"F21",2014-04-18,2022-06-29 -10,"F22",2016-12-15,2018-11-28 -15,"F3",2017-12-08,2019-09-23 -15,"F31",2022-03-22,2023-08-20 -15,"E66",2009-05-18,2021-11-06 From c23e4b68ceb3abaeff468e57240dd2538ebf0401 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 31 Oct 2024 09:02:37 +0100 
Subject: [PATCH 17/31] More cleanup --- .../cluster/WorkerUpdateMatchingStatsJob.java | 154 ------------------ ...ob.java => UpdateMatchingStatsSqlJob.java} | 8 +- .../models/config/SqlConnectorConfig.java | 8 +- .../specific/UpdateMatchingStatsMessage.java | 151 ++++++++++++++++- .../models/worker/LocalNamespace.java | 6 +- .../conquery/models/worker/Namespace.java | 2 +- 6 files changed, 161 insertions(+), 168 deletions(-) delete mode 100644 backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java rename backend/src/main/java/com/bakdata/conquery/mode/local/{SqlUpdateMatchingStatsJob.java => UpdateMatchingStatsSqlJob.java} (98%) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java deleted file mode 100644 index 71d65a3ffc..0000000000 --- a/backend/src/main/java/com/bakdata/conquery/mode/cluster/WorkerUpdateMatchingStatsJob.java +++ /dev/null @@ -1,154 +0,0 @@ -package com.bakdata.conquery.mode.cluster; - -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import com.bakdata.conquery.models.datasets.Table; -import com.bakdata.conquery.models.datasets.concepts.Concept; -import com.bakdata.conquery.models.datasets.concepts.ConceptElement; -import com.bakdata.conquery.models.datasets.concepts.Connector; -import com.bakdata.conquery.models.datasets.concepts.MatchingStats; -import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; -import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; -import com.bakdata.conquery.models.events.Bucket; -import com.bakdata.conquery.models.events.CBlock; -import 
com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; -import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; -import com.bakdata.conquery.models.jobs.Job; -import com.bakdata.conquery.models.messages.namespaces.specific.UpdateElementMatchingStats; -import com.bakdata.conquery.models.worker.Worker; -import com.bakdata.conquery.util.progressreporter.ProgressReporter; -import com.google.common.base.Functions; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@RequiredArgsConstructor -public class WorkerUpdateMatchingStatsJob extends Job { - private final Worker worker; - private final Collection concepts; - - @Override - public void execute() throws Exception { - if (worker.getStorage().getAllCBlocks().findAny().isEmpty()) { - log.debug("Worker {} is empty, skipping.", worker); - return; - } - - final ProgressReporter progressReporter = getProgressReporter(); - progressReporter.setMax(concepts.size()); - - log.info("BEGIN update Matching stats for {} Concepts", concepts.size()); - - final Map> - subJobs = - concepts.stream() - .collect(Collectors.toMap(Functions.identity(), - concept -> CompletableFuture.runAsync(() -> { - final Concept resolved = concept.resolve(); - final Map, MatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); - - calculateConceptMatches(resolved, matchingStats, worker); - worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), matchingStats)); - - progressReporter.report(1); - }, worker.getJobsExecutorService()) - )); - - - log.debug("All jobs submitted. Waiting for completion."); - - - final CompletableFuture all = CompletableFuture.allOf(subJobs.values().toArray(CompletableFuture[]::new)); - - do { - try { - all.get(1, TimeUnit.MINUTES); - } - catch (TimeoutException exception) { - // Count unfinished matching stats jobs. 
- if (log.isDebugEnabled()) { - final long unfinished = subJobs.values().stream().filter(Predicate.not(CompletableFuture::isDone)).count(); - log.debug("{} still waiting for {} tasks", worker.getInfo().getDataset(), unfinished); - } - - // When trace, also log the unfinished jobs. - if (log.isTraceEnabled()) { - subJobs.forEach((concept, future) -> { - if (future.isDone()) { - return; - } - - log.trace("Still waiting for `{}`", concept); - - }); - } - } - } while (!all.isDone()); - - log.debug("DONE collecting matching stats for {}", worker.getInfo().getDataset()); - - } - - @Override - public String getLabel() { - return String.format("Calculate Matching Stats for %s", worker.getInfo().getDataset()); - } - - private static void calculateConceptMatches(Concept concept, Map, MatchingStats.Entry> results, Worker worker) { - log.debug("BEGIN calculating for `{}`", concept.getId()); - - for (CBlock cBlock : worker.getStorage().getAllCBlocks().toList()) { - - if (!cBlock.getConnector().getConcept().equals(concept.getId())) { - continue; - } - - try { - final Bucket bucket = cBlock.getBucket().resolve(); - final Table table = bucket.getTable().resolve(); - - for (String entity : bucket.entities()) { - - final int entityEnd = bucket.getEntityEnd(entity); - - for (int event = bucket.getEntityStart(entity); event < entityEnd; event++) { - - final int[] localIds = cBlock.getPathToMostSpecificChild(event); - - - if (!(concept instanceof TreeConcept) || localIds == null) { - results.computeIfAbsent(concept.getId(), (ignored) -> new MatchingStats.Entry()).addEvent(table, bucket, event, entity); - continue; - } - - if (Connector.isNotContained(localIds)) { - continue; - } - - ConceptTreeNode element = ((TreeConcept) concept).getElementByLocalIdPath(localIds); - - while (element != null) { - results.computeIfAbsent(((ConceptElement) element).getId(), (ignored) -> new MatchingStats.Entry()) - .addEvent(table, bucket, event, entity); - element = element.getParent(); - } - } - } - - 
} - catch (Exception e) { - log.error("Failed to collect the matching stats for {}", cBlock, e); - } - } - - log.trace("DONE calculating for `{}`", concept.getId()); - } - -} diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java similarity index 98% rename from backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java rename to backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 13ec43196e..db26dad32c 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/SqlUpdateMatchingStatsJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -57,7 +57,7 @@ import org.jooq.Table; @Slf4j -public class SqlUpdateMatchingStatsJob extends Job { +public class UpdateMatchingStatsSqlJob extends Job { private static final Name CONNECTOR_COLUMN = name("connector_column"); private static final Name EVENTS = name("events"); @@ -71,7 +71,7 @@ public class SqlUpdateMatchingStatsJob extends Job { private final Set concepts; private final ExecutorService executors; - public SqlUpdateMatchingStatsJob( + public UpdateMatchingStatsSqlJob( DatabaseConfig databaseConfig, SqlExecutionService executionService, SqlFunctionProvider functionProvider, @@ -101,7 +101,7 @@ public void execute() throws Exception { final List> runningQueries = concepts.stream() .map(ConceptId::resolve) - .filter(SqlUpdateMatchingStatsJob::isTreeConcept) + .filter(UpdateMatchingStatsSqlJob::isTreeConcept) .map(TreeConcept.class::cast) .map(treeConcept -> (Runnable) () -> calculateMatchingStats(treeConcept)) .map(executors::submit) @@ -115,7 +115,7 @@ public void execute() throws Exception { stopWatch.stop(); log.debug("DONE collecting matching stats. 
Elapsed time: {} ms.", stopWatch.getTime()); - runningQueries.forEach(SqlUpdateMatchingStatsJob::checkForError); + runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java index e724d44400..1c0fe6be12 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java +++ b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java @@ -1,14 +1,14 @@ package com.bakdata.conquery.models.config; import java.util.Map; -import jakarta.validation.Valid; -import jakarta.validation.constraints.Min; -import com.bakdata.conquery.mode.local.SqlUpdateMatchingStatsJob; +import com.bakdata.conquery.mode.local.UpdateMatchingStatsSqlJob; import com.bakdata.conquery.models.datasets.Dataset; import com.fasterxml.jackson.annotation.JsonIgnore; import io.dropwizard.util.Duration; import io.dropwizard.validation.ValidationMethod; +import jakarta.validation.Valid; +import jakarta.validation.constraints.Min; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -39,7 +39,7 @@ public class SqlConnectorConfig { private boolean withPrettyPrinting; /** - * The amount of threads for background tasks like calculating matching stats {@link SqlUpdateMatchingStatsJob}. + * The amount of threads for background tasks like calculating matching stats {@link UpdateMatchingStatsSqlJob}. 
*/ @Min(1) @Builder.Default diff --git a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java index 82b9d66474..1113510eed 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java +++ b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java @@ -1,14 +1,33 @@ package com.bakdata.conquery.models.messages.namespaces.specific; import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Predicate; +import java.util.stream.Collectors; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.mode.cluster.WorkerUpdateMatchingStatsJob; +import com.bakdata.conquery.models.datasets.Table; +import com.bakdata.conquery.models.datasets.concepts.Concept; +import com.bakdata.conquery.models.datasets.concepts.ConceptElement; +import com.bakdata.conquery.models.datasets.concepts.Connector; +import com.bakdata.conquery.models.datasets.concepts.MatchingStats; +import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; +import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept; +import com.bakdata.conquery.models.events.Bucket; +import com.bakdata.conquery.models.events.CBlock; +import com.bakdata.conquery.models.identifiable.ids.specific.ConceptElementId; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; +import com.bakdata.conquery.models.jobs.Job; import com.bakdata.conquery.models.messages.namespaces.NamespacedMessage; import com.bakdata.conquery.models.messages.namespaces.WorkerMessage; import com.bakdata.conquery.models.worker.Worker; +import 
com.bakdata.conquery.util.progressreporter.ProgressReporter; import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.Functions; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -27,6 +46,134 @@ public class UpdateMatchingStatsMessage extends WorkerMessage { @Override public void react(Worker worker) throws Exception { - worker.getJobManager().addSlowJob(new WorkerUpdateMatchingStatsJob(worker, concepts)); + worker.getJobManager().addSlowJob(new UpdateMatchingStatsWorkerJob(worker, concepts)); } + + @Slf4j + @RequiredArgsConstructor + public static class UpdateMatchingStatsWorkerJob extends Job { + private final Worker worker; + private final Collection concepts; + + @Override + public void execute() throws Exception { + if (worker.getStorage().getAllCBlocks().findAny().isEmpty()) { + log.debug("Worker {} is empty, skipping.", worker); + return; + } + + final ProgressReporter progressReporter = getProgressReporter(); + progressReporter.setMax(concepts.size()); + + log.info("BEGIN update Matching stats for {} Concepts", concepts.size()); + + final Map> + subJobs = + concepts.stream() + .collect(Collectors.toMap( + Functions.identity(), + concept -> CompletableFuture.runAsync(() -> { + final Concept resolved = concept.resolve(); + final Map, MatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); + + calculateConceptMatches(resolved, matchingStats, worker); + worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), matchingStats)); + + progressReporter.report(1); + }, worker.getJobsExecutorService()) + )); + + + log.debug("All jobs submitted. Waiting for completion."); + + + final CompletableFuture all = CompletableFuture.allOf(subJobs.values().toArray(CompletableFuture[]::new)); + + do { + try { + all.get(1, TimeUnit.MINUTES); + } + catch (TimeoutException exception) { + // Count unfinished matching stats jobs. 
+ if (log.isDebugEnabled()) { + final long unfinished = subJobs.values().stream().filter(Predicate.not(CompletableFuture::isDone)).count(); + log.debug("{} still waiting for {} tasks", worker.getInfo().getDataset(), unfinished); + } + + // When trace, also log the unfinished jobs. + if (log.isTraceEnabled()) { + subJobs.forEach((concept, future) -> { + if (future.isDone()) { + return; + } + + log.trace("Still waiting for `{}`", concept); + + }); + } + } + } while (!all.isDone()); + + log.debug("DONE collecting matching stats for {}", worker.getInfo().getDataset()); + + } + + @Override + public String getLabel() { + return String.format("Calculate Matching Stats for %s", worker.getInfo().getDataset()); + } + + private static void calculateConceptMatches(Concept concept, Map, MatchingStats.Entry> results, Worker worker) { + log.debug("BEGIN calculating for `{}`", concept.getId()); + + for (CBlock cBlock : worker.getStorage().getAllCBlocks().toList()) { + + if (!cBlock.getConnector().getConcept().equals(concept.getId())) { + continue; + } + + try { + final Bucket bucket = cBlock.getBucket().resolve(); + final Table table = bucket.getTable().resolve(); + + for (String entity : bucket.entities()) { + + final int entityEnd = bucket.getEntityEnd(entity); + + for (int event = bucket.getEntityStart(entity); event < entityEnd; event++) { + + final int[] localIds = cBlock.getPathToMostSpecificChild(event); + + + if (!(concept instanceof TreeConcept) || localIds == null) { + results.computeIfAbsent(concept.getId(), (ignored) -> new MatchingStats.Entry()).addEvent(table, bucket, event, entity); + continue; + } + + if (Connector.isNotContained(localIds)) { + continue; + } + + ConceptTreeNode element = ((TreeConcept) concept).getElementByLocalIdPath(localIds); + + while (element != null) { + results.computeIfAbsent(((ConceptElement) element).getId(), (ignored) -> new MatchingStats.Entry()) + .addEvent(table, bucket, event, entity); + element = element.getParent(); + } + } + } + + 
} + catch (Exception e) { + log.error("Failed to collect the matching stats for {}", cBlock, e); + } + } + + log.trace("DONE calculating for `{}`", concept.getId()); + } + + } + + } diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index 383fe5f4a4..e8bd291e27 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -12,7 +12,7 @@ import com.bakdata.conquery.io.storage.NamespaceStorage; import com.bakdata.conquery.mode.local.SqlEntityResolver; import com.bakdata.conquery.mode.local.SqlStorageHandler; -import com.bakdata.conquery.mode.local.SqlUpdateMatchingStatsJob; +import com.bakdata.conquery.mode.local.UpdateMatchingStatsSqlJob; import com.bakdata.conquery.models.config.DatabaseConfig; import com.bakdata.conquery.models.config.SqlConnectorConfig; import com.bakdata.conquery.models.datasets.Column; @@ -65,9 +65,9 @@ public LocalNamespace( @Override void updateMatchingStats() { - final Set concepts = collectConceptsWithoutMatchingStats(); + final Set concepts = getConceptsWithoutMatchingStats(); ExecutorService executorService = Executors.newFixedThreadPool(sqlConnectorConfig.getBackgroundThreads()); - Job job = new SqlUpdateMatchingStatsJob( + Job job = new UpdateMatchingStatsSqlJob( databaseConfig, sqlExecutionService, sqlDialect.getFunctionProvider(), diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java index 9821f5303f..7696cf4334 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/Namespace.java @@ -142,7 +142,7 @@ public void postprocessData() { } - protected Set collectConceptsWithoutMatchingStats() { + protected Set 
getConceptsWithoutMatchingStats() { return getStorage().getAllConcepts() .filter(concept -> concept.getMatchingStats() == null) .map(Concept::getId) From c850c4ea1a81f121fdea34ad56c7084c3faabf89 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 31 Oct 2024 11:42:23 +0100 Subject: [PATCH 18/31] Use ListenableFutures --- .../mode/local/LocalNamespaceHandler.java | 1 + .../mode/local/UpdateMatchingStatsSqlJob.java | 39 ++++++++----------- .../models/config/SqlConnectorConfig.java | 11 ------ .../models/worker/LocalNamespace.java | 10 +++-- .../sql/dialect/TestSqlConnectorConfig.java | 2 +- 5 files changed, 26 insertions(+), 37 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java b/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java index 47f2d6379c..5bf80be772 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/LocalNamespaceHandler.java @@ -64,6 +64,7 @@ public LocalNamespace createNamespace( return new LocalNamespace( namespaceData.getPreprocessMapper(), namespaceStorage, + config.getQueries().getExecutionPool(), sqlConnectorConfig, databaseConfig, sqlDialect, diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index db26dad32c..225bcb1ac1 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -17,9 +17,7 @@ import java.util.Optional; import java.util.Set; import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; import 
java.util.stream.Collectors; @@ -46,6 +44,9 @@ import com.bakdata.conquery.sql.execution.SqlExecutionService; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.TablePrimaryColumnUtil; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.time.StopWatch; import org.jooq.Condition; @@ -69,14 +70,14 @@ public class UpdateMatchingStatsSqlJob extends Job { private final DSLContext dslContext; private final SqlFunctionProvider functionProvider; private final Set concepts; - private final ExecutorService executors; + private final ListeningExecutorService executors; public UpdateMatchingStatsSqlJob( DatabaseConfig databaseConfig, SqlExecutionService executionService, SqlFunctionProvider functionProvider, Set concepts, - ExecutorService executors + ListeningExecutorService executors ) { this.databaseConfig = databaseConfig; this.executionService = executionService; @@ -95,27 +96,21 @@ public String getLabel() { public void execute() throws Exception { log.debug("BEGIN update Matching stats for {} Concepts.", concepts.size()); - final StopWatch stopWatch = new StopWatch(); stopWatch.start(); - final List> runningQueries = concepts.stream() - .map(ConceptId::resolve) - .filter(UpdateMatchingStatsSqlJob::isTreeConcept) - .map(TreeConcept.class::cast) - .map(treeConcept -> (Runnable) () -> calculateMatchingStats(treeConcept)) - .map(executors::submit) - .collect(Collectors.toList()); - - executors.shutdown(); - while (!executors.awaitTermination(1, TimeUnit.MINUTES)) { - log.debug("Waiting for executors to set matching stats for all concepts..."); - } - - stopWatch.stop(); - log.debug("DONE collecting matching stats. 
Elapsed time: {} ms.", stopWatch.getTime()); - - runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); + final List> runningQueries = concepts.stream() + .map(ConceptId::resolve) + .filter(UpdateMatchingStatsSqlJob::isTreeConcept) + .map(TreeConcept.class::cast) + .map(treeConcept -> executors.submit(() -> calculateMatchingStats(treeConcept))) + .collect(Collectors.toList()); + + Futures.whenAllComplete(runningQueries).run(() -> { + stopWatch.stop(); + log.debug("DONE collecting matching stats. Elapsed time: {} ms.", stopWatch.getTime()); + runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); + }, executors); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java index 1c0fe6be12..19d6393526 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java +++ b/backend/src/main/java/com/bakdata/conquery/models/config/SqlConnectorConfig.java @@ -2,13 +2,11 @@ import java.util.Map; -import com.bakdata.conquery.mode.local.UpdateMatchingStatsSqlJob; import com.bakdata.conquery.models.datasets.Dataset; import com.fasterxml.jackson.annotation.JsonIgnore; import io.dropwizard.util.Duration; import io.dropwizard.validation.ValidationMethod; import jakarta.validation.Valid; -import jakarta.validation.constraints.Min; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -29,8 +27,6 @@ @AllArgsConstructor public class SqlConnectorConfig { - private static final int DEFAULT_BACKGROUND_THREADS = 1; - private boolean enabled; /** @@ -38,13 +34,6 @@ public class SqlConnectorConfig { */ private boolean withPrettyPrinting; - /** - * The amount of threads for background tasks like calculating matching stats {@link UpdateMatchingStatsSqlJob}. 
- */ - @Min(1) - @Builder.Default - private int backgroundThreads = DEFAULT_BACKGROUND_THREADS; - /** * Keys must match the name of existing {@link Dataset}s. */ diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index e8bd291e27..f3a290c3ff 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -3,8 +3,6 @@ import java.io.IOException; import java.util.List; import java.util.Set; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -15,6 +13,7 @@ import com.bakdata.conquery.mode.local.UpdateMatchingStatsSqlJob; import com.bakdata.conquery.models.config.DatabaseConfig; import com.bakdata.conquery.models.config.SqlConnectorConfig; +import com.bakdata.conquery.models.config.ThreadPoolDefinition; import com.bakdata.conquery.models.datasets.Column; import com.bakdata.conquery.models.identifiable.ids.specific.ConceptId; import com.bakdata.conquery.models.jobs.Job; @@ -25,6 +24,8 @@ import com.bakdata.conquery.sql.conversion.dialect.SqlDialect; import com.bakdata.conquery.sql.execution.SqlExecutionService; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; import lombok.Getter; import lombok.extern.slf4j.Slf4j; @@ -32,6 +33,7 @@ @Slf4j public class LocalNamespace extends Namespace { + private final ThreadPoolDefinition executionPool; private final SqlConnectorConfig sqlConnectorConfig; private final DatabaseConfig databaseConfig; private final SqlDialect sqlDialect; @@ -42,6 +44,7 @@ public class LocalNamespace extends Namespace { public LocalNamespace( ObjectMapper preprocessMapper, NamespaceStorage storage, + 
ThreadPoolDefinition executionPool, SqlConnectorConfig sqlConnectorConfig, DatabaseConfig databaseConfig, SqlDialect sqlDialect, @@ -55,6 +58,7 @@ public LocalNamespace( List injectables ) { super(preprocessMapper, storage, executionManager, jobManager, filterSearch, sqlEntityResolver, injectables); + this.executionPool = executionPool; this.sqlConnectorConfig = sqlConnectorConfig; this.databaseConfig = databaseConfig; this.sqlDialect = sqlDialect; @@ -66,7 +70,7 @@ public LocalNamespace( @Override void updateMatchingStats() { final Set concepts = getConceptsWithoutMatchingStats(); - ExecutorService executorService = Executors.newFixedThreadPool(sqlConnectorConfig.getBackgroundThreads()); + final ListeningExecutorService executorService = MoreExecutors.listeningDecorator(executionPool.createService("sql-matching-stats")); Job job = new UpdateMatchingStatsSqlJob( databaseConfig, sqlExecutionService, diff --git a/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java b/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java index ae02f110ec..bf3c1a5325 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/sql/dialect/TestSqlConnectorConfig.java @@ -15,7 +15,7 @@ public class TestSqlConnectorConfig extends SqlConnectorConfig { private static final String TEST_DATASET = "test"; public TestSqlConnectorConfig(DatabaseConfig databaseConfig) { - super(true, true, Runtime.getRuntime().availableProcessors(), Map.of(TEST_DATASET, databaseConfig), null); + super(true, true, Map.of(TEST_DATASET, databaseConfig), null); } @Override From 5be35d63c66c48f9318a5c4f4b046a9dbaea3197 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Thu, 31 Oct 2024 11:45:08 +0100 Subject: [PATCH 19/31] Revert changes in UpdateMatchingStatsMessage --- .../specific/UpdateMatchingStatsMessage.java | 25 
++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java index 1113510eed..19f54c8d13 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java +++ b/backend/src/main/java/com/bakdata/conquery/models/messages/namespaces/specific/UpdateMatchingStatsMessage.java @@ -46,12 +46,11 @@ public class UpdateMatchingStatsMessage extends WorkerMessage { @Override public void react(Worker worker) throws Exception { - worker.getJobManager().addSlowJob(new UpdateMatchingStatsWorkerJob(worker, concepts)); + worker.getJobManager().addSlowJob(new UpdateMatchingStatsJob(worker, concepts)); } - @Slf4j @RequiredArgsConstructor - public static class UpdateMatchingStatsWorkerJob extends Job { + private static class UpdateMatchingStatsJob extends Job { private final Worker worker; private final Collection concepts; @@ -70,17 +69,19 @@ public void execute() throws Exception { final Map> subJobs = concepts.stream() - .collect(Collectors.toMap( - Functions.identity(), - concept -> CompletableFuture.runAsync(() -> { - final Concept resolved = concept.resolve(); - final Map, MatchingStats.Entry> matchingStats = new HashMap<>(resolved.countElements()); + .collect(Collectors.toMap(Functions.identity(), + concept -> CompletableFuture.runAsync(() -> { + final Concept resolved = concept.resolve(); + final Map, MatchingStats.Entry> + matchingStats = + new HashMap<>(resolved.countElements()); - calculateConceptMatches(resolved, matchingStats, worker); - worker.send(new UpdateElementMatchingStats(worker.getInfo().getId(), matchingStats)); + calculateConceptMatches(resolved, matchingStats, worker); - progressReporter.report(1); - }, worker.getJobsExecutorService()) + worker.send(new 
UpdateElementMatchingStats(worker.getInfo().getId(), matchingStats)); + + progressReporter.report(1); + }, worker.getJobsExecutorService()) )); From 0988c8c3fe0b0ec497de061b7437f0d11710f18e Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Tue, 5 Nov 2024 11:21:36 +0100 Subject: [PATCH 20/31] Draft --- .../mode/local/UpdateMatchingStatsSqlJob.java | 61 ++++++++----------- .../models/worker/LocalNamespace.java | 4 +- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 225bcb1ac1..9ff5cee85a 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -17,6 +17,7 @@ import java.util.Optional; import java.util.Set; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; @@ -47,6 +48,7 @@ import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.time.StopWatch; import org.jooq.Condition; @@ -77,14 +79,15 @@ public UpdateMatchingStatsSqlJob( SqlExecutionService executionService, SqlFunctionProvider functionProvider, Set concepts, - ListeningExecutorService executors + ExecutorService executors ) { this.databaseConfig = databaseConfig; this.executionService = executionService; this.dslContext = executionService.getDslContext(); this.functionProvider = functionProvider; this.concepts = concepts; - this.executors = executors; + this.executors = 
MoreExecutors.listeningDecorator(executors); + ; } @Override @@ -106,17 +109,14 @@ public void execute() throws Exception { .map(treeConcept -> executors.submit(() -> calculateMatchingStats(treeConcept))) .collect(Collectors.toList()); - Futures.whenAllComplete(runningQueries).run(() -> { - stopWatch.stop(); - log.debug("DONE collecting matching stats. Elapsed time: {} ms.", stopWatch.getTime()); - runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); - }, executors); - } + final ListenableFuture> futureList = Futures.allAsList(runningQueries); + while (!futureList.isDone()) { + log.debug("Waiting for executors collect matching stats..."); + } - @Override - public void cancel() { - super.cancel(); - executors.shutdownNow(); + stopWatch.stop(); + log.debug("DONE collecting matching stats. Elapsed time: {} ms.", stopWatch.getTime()); + runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); } private static boolean isTreeConcept(final Concept concept) { @@ -196,31 +196,22 @@ private Map>> collectRelevantColumns(final TreeConcept t } private Set collectRelevantColumns(final Connector connector, final List children) { - final Set relevantColumns = new HashSet<>(); - - for (ConceptTreeChild child : children) { - if (child.getCondition() == null && child.getChildren().isEmpty()) { - continue; - } - - final Set childColumns = new HashSet<>(); - - // Recursively collect columns from the current child's children, if they exist - if (!child.getChildren().isEmpty()) { - final Set childrenColumns = collectRelevantColumns(connector, child.getChildren()); - childColumns.addAll(childrenColumns); - } - - // Add columns from the child's condition, if it exists - if (child.getCondition() != null) { - final Set conditionColumns = child.getCondition().getColumns(connector); - childColumns.addAll(conditionColumns); - } + return children.stream().flatMap(child -> collectRelevantColumns(connector, child).stream()).collect(Collectors.toSet()); + } - 
relevantColumns.addAll(childColumns); + private Set collectRelevantColumns(final Connector connector, final ConceptTreeChild child) { + final Set childColumns = new HashSet<>(); + // Recursively collect columns from the current child's children, if they exist + if (!child.getChildren().isEmpty()) { + final Set childrenColumns = collectRelevantColumns(connector, child.getChildren()); + childColumns.addAll(childrenColumns); } - - return relevantColumns; + // Add columns from the child's condition, if it exists + if (child.getCondition() != null) { + final Set conditionColumns = child.getCondition().getColumns(connector); + childColumns.addAll(conditionColumns); + } + return childColumns; } private Map> createColumnDateRanges(final TreeConcept treeConcept) { diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index f3a290c3ff..b1523936e7 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -24,8 +24,6 @@ import com.bakdata.conquery.sql.conversion.dialect.SqlDialect; import com.bakdata.conquery.sql.execution.SqlExecutionService; import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.util.concurrent.ListeningExecutorService; -import com.google.common.util.concurrent.MoreExecutors; import lombok.Getter; import lombok.extern.slf4j.Slf4j; @@ -70,7 +68,7 @@ public LocalNamespace( @Override void updateMatchingStats() { final Set concepts = getConceptsWithoutMatchingStats(); - final ListeningExecutorService executorService = MoreExecutors.listeningDecorator(executionPool.createService("sql-matching-stats")); + Job job = new UpdateMatchingStatsSqlJob( databaseConfig, sqlExecutionService, From d0b02b71744ec61ba8cebff787733fe8021670c3 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 6 Nov 2024 09:35:01 +0100 
Subject: [PATCH 21/31] Refactoring --- .../mode/local/UpdateMatchingStatsSqlJob.java | 74 ++++++++++++------- .../models/worker/LocalNamespace.java | 5 +- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 9ff5cee85a..06d1ba261a 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -16,9 +16,9 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; import java.util.stream.Collectors; @@ -51,6 +51,7 @@ import com.google.common.util.concurrent.MoreExecutors; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.time.StopWatch; +import org.jooq.AggregateFunction; import org.jooq.Condition; import org.jooq.DSLContext; import org.jooq.Field; @@ -66,6 +67,8 @@ public class UpdateMatchingStatsSqlJob extends Job { private static final Name EVENTS = name("events"); private static final Name ENTITIES = name("entities"); private static final Name DATES = name("dates"); + private static final Name MIN_DATE = name("min_date"); + private static final Name MAX_DATE = name("max_date"); private final DatabaseConfig databaseConfig; private final SqlExecutionService executionService; @@ -73,6 +76,7 @@ public class UpdateMatchingStatsSqlJob extends Job { private final SqlFunctionProvider functionProvider; private final Set concepts; private final ListeningExecutorService executors; + private ListenableFuture all; public 
UpdateMatchingStatsSqlJob( DatabaseConfig databaseConfig, @@ -87,7 +91,6 @@ public UpdateMatchingStatsSqlJob( this.functionProvider = functionProvider; this.concepts = concepts; this.executors = MoreExecutors.listeningDecorator(executors); - ; } @Override @@ -109,14 +112,29 @@ public void execute() throws Exception { .map(treeConcept -> executors.submit(() -> calculateMatchingStats(treeConcept))) .collect(Collectors.toList()); - final ListenableFuture> futureList = Futures.allAsList(runningQueries); - while (!futureList.isDone()) { - log.debug("Waiting for executors collect matching stats..."); + all = Futures.allAsList(runningQueries); + while (!all.isDone()) { + try { + all.get(1, TimeUnit.MINUTES); + } + catch (TimeoutException exception) { + log.debug("Still waiting for {}", this); + if (log.isTraceEnabled()) { + log.trace("Waiting for {}", executors); + } + } } stopWatch.stop(); log.debug("DONE collecting matching stats. Elapsed time: {} ms.", stopWatch.getTime()); - runningQueries.forEach(UpdateMatchingStatsSqlJob::checkForError); + } + + @Override + public void cancel() { + if (all != null) { + all.cancel(true); + } + super.cancel(); } private static boolean isTreeConcept(final Concept concept) { @@ -127,15 +145,6 @@ private static boolean isTreeConcept(final Concept concept) { return true; } - private static void checkForError(final Future future) { - try { - future.get(); - } - catch (ExecutionException | InterruptedException e) { - log.error("Unknown error while querying SQL matching stats. 
Cause: \n", e.getCause()); - } - } - public void calculateMatchingStats(final TreeConcept treeConcept) { final Map>> relevantColumns = collectRelevantColumns(treeConcept); @@ -151,8 +160,8 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); - final Field validityDateExpression = functionProvider.daterangeStringExpression(minAndMax).as(DATES); + final AggregateFunction minDate = min(functionProvider.least(allStarts)); + final AggregateFunction maxDate = max(functionProvider.greatest((allEnds))); // all connectors need the same columns originating from the concept definition - they might have different names in the respective connector tables, // but as we aliased them already, we can just use the unified aliases in the final query @@ -164,7 +173,8 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .select( count(asterisk()).as(EVENTS), countDistinct(field(ENTITIES)).as(ENTITIES), - validityDateExpression + minDate.as(MIN_DATE), + maxDate.as(MAX_DATE) ) .from(unioned) .groupBy(relevantColumnsAliased); @@ -303,16 +313,30 @@ private void mapRecordToConceptElements( }); } - private MatchingStats.Entry toMatchingStatsEntry(Record record) { + private MatchingStats.Entry toMatchingStatsEntry(final Record record) { final long events = record.get(EVENTS, Integer.class).longValue(); final long entities = record.get(ENTITIES, Integer.class).longValue(); - final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); - return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), 
dateSpan.getMaxValue()); + + final int minDate = record.get(MIN_DATE, Date.class) == null + ? CDateRange.NEGATIVE_INFINITY + : toValidInt(record.get(MIN_DATE, Date.class)); + + final int maxDate = record.get(MAX_DATE, Date.class) == null + ? CDateRange.POSITIVE_INFINITY + : toValidInt(record.get(MAX_DATE, Date.class)); + + return new MatchingStats.Entry(events, entities, minDate, maxDate); } - private CDateRange toDateRange(final String validityDateExpression) { - final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); - return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : CDateRange.all(); + private static int toValidInt(final Date date) { + final long epochDay = date.toLocalDate().toEpochDay(); + if (epochDay < Integer.MIN_VALUE) { + return Integer.MIN_VALUE; + } + else if (epochDay > Integer.MAX_VALUE) { + return Integer.MAX_VALUE; + } + return Math.toIntExact(epochDay); } private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { diff --git a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java index b1523936e7..c8e62616f6 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java +++ b/backend/src/main/java/com/bakdata/conquery/models/worker/LocalNamespace.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.util.List; import java.util.Set; +import java.util.concurrent.ThreadPoolExecutor; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -38,6 +39,7 @@ public class LocalNamespace extends Namespace { private final SqlExecutionService sqlExecutionService; private final DSLContextWrapper dslContextWrapper; private final SqlStorageHandler storageHandler; + private final ThreadPoolExecutor executorService; public LocalNamespace( ObjectMapper 
preprocessMapper, @@ -63,12 +65,13 @@ public LocalNamespace( this.sqlExecutionService = sqlExecutionService; this.dslContextWrapper = dslContextWrapper; this.storageHandler = storageHandler; + // TODO FK: hoist into Namespace and use at other places && is this the correct way to name them? + this.executorService = this.executionPool.createService("namespace %s worker".formatted(storage.getPathName())); } @Override void updateMatchingStats() { final Set concepts = getConceptsWithoutMatchingStats(); - Job job = new UpdateMatchingStatsSqlJob( databaseConfig, sqlExecutionService, From 20fc92e5558745505debc6cf38b61c6bbcdfadf9 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 6 Nov 2024 09:51:41 +0100 Subject: [PATCH 22/31] Revert changes --- .../sql/execution/DefaultResultSetProcessor.java | 12 +++++------- .../sql/execution/DefaultSqlCDateSetParser.java | 9 ++------- .../conquery/sql/execution/ResultSetProcessor.java | 2 -- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java index 531bd7fbe5..b17d3db261 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java @@ -11,15 +11,13 @@ import com.bakdata.conquery.models.config.ConqueryConfig; import com.bakdata.conquery.util.DateReader; -import lombok.Getter; import lombok.RequiredArgsConstructor; @RequiredArgsConstructor class DefaultResultSetProcessor implements ResultSetProcessor { private final ConqueryConfig config; - @Getter - private final SqlCDateSetParser cDateSetParser; + private final SqlCDateSetParser sqlCDateSetParser; @Override public String getString(ResultSet resultSet, int columnIndex) throws SQLException { @@ -62,12 +60,12 @@ public Integer getDate(ResultSet resultSet, int 
columnIndex) throws SQLException @Override public List getDateRange(ResultSet resultSet, int columnIndex) throws SQLException { - return this.cDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); + return this.sqlCDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); } @Override public List> getDateRangeList(ResultSet resultSet, int columnIndex) throws SQLException { - return this.cDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); + return this.sqlCDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); } @Override @@ -121,7 +119,7 @@ private List fromString(ResultSet resultSet, int columnIndex, Function * For example, calling a primitives' ResultSet getter like getDouble, getInt etc. straightaway will never return null. */ - private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, GetMethod getter, Class resultType) throws SQLException { + private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, Getter getter, Class resultType) throws SQLException { if (resultSet.getObject(columnIndex) == null) { return null; diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java index 8f7bf0c839..29b7fca7dc 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java @@ -12,11 +12,6 @@ public class DefaultSqlCDateSetParser implements SqlCDateSetParser { - /** - * Postgres daterange function creates this expression when called with null-arguments instead of null. 
- */ - public static final String POSTGRES_NULL_RANGE = "(,)"; - public static final String EMPTY_RANGE_BRACES = "{}"; public static final String DATE_SEPARATOR = ","; public static final char INCLUDED_START_CHAR = '['; @@ -42,12 +37,12 @@ public List> toEpochDayRangeList(String multiDateRange) { @Override public List toEpochDayRange(String daterange) { - if (daterange == null || daterange.equals(POSTGRES_NULL_RANGE)) { + if (daterange == null) { return Collections.emptyList(); } String[] dates = daterange.split(DATE_SEPARATOR); - Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start and end. Input was: %s".formatted(daterange)); + Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start and end."); // the dateranges have always an included start date marked by a [ String startDateExpression = dates[0]; diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java index a7e6498751..074716073d 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java @@ -9,8 +9,6 @@ public interface ResultSetProcessor { char UNIT_SEPARATOR = (char) 31; // https://www.ascii-code.com/character/%E2%90%9F - SqlCDateSetParser getCDateSetParser(); - String getString(ResultSet resultSet, int columnIndex) throws SQLException; Integer getInteger(ResultSet resultSet, int columnIndex) throws SQLException; From 2f2a4aee6d4962322bcd8ebeb073528cbae6ccbd Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 6 Nov 2024 12:18:17 +0100 Subject: [PATCH 23/31] Revert "Revert changes" This reverts commit 20fc92e5558745505debc6cf38b61c6bbcdfadf9. 
--- .../sql/execution/DefaultResultSetProcessor.java | 12 +++++++----- .../sql/execution/DefaultSqlCDateSetParser.java | 9 +++++++-- .../conquery/sql/execution/ResultSetProcessor.java | 2 ++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java index b17d3db261..531bd7fbe5 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultResultSetProcessor.java @@ -11,13 +11,15 @@ import com.bakdata.conquery.models.config.ConqueryConfig; import com.bakdata.conquery.util.DateReader; +import lombok.Getter; import lombok.RequiredArgsConstructor; @RequiredArgsConstructor class DefaultResultSetProcessor implements ResultSetProcessor { private final ConqueryConfig config; - private final SqlCDateSetParser sqlCDateSetParser; + @Getter + private final SqlCDateSetParser cDateSetParser; @Override public String getString(ResultSet resultSet, int columnIndex) throws SQLException { @@ -60,12 +62,12 @@ public Integer getDate(ResultSet resultSet, int columnIndex) throws SQLException @Override public List getDateRange(ResultSet resultSet, int columnIndex) throws SQLException { - return this.sqlCDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); + return this.cDateSetParser.toEpochDayRange(resultSet.getString(columnIndex)); } @Override public List> getDateRangeList(ResultSet resultSet, int columnIndex) throws SQLException { - return this.sqlCDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); + return this.cDateSetParser.toEpochDayRangeList(resultSet.getString(columnIndex)); } @Override @@ -119,7 +121,7 @@ private List fromString(ResultSet resultSet, int columnIndex, Function * For example, calling a primitives' ResultSet getter like getDouble, getInt etc. 
straightaway will never return null. */ - private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, Getter getter, Class resultType) throws SQLException { + private static T checkForNullElseGet(ResultSet resultSet, int columnIndex, GetMethod getter, Class resultType) throws SQLException { if (resultSet.getObject(columnIndex) == null) { return null; diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java index 29b7fca7dc..8f7bf0c839 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/DefaultSqlCDateSetParser.java @@ -12,6 +12,11 @@ public class DefaultSqlCDateSetParser implements SqlCDateSetParser { + /** + * Postgres daterange function creates this expression when called with null-arguments instead of null. + */ + public static final String POSTGRES_NULL_RANGE = "(,)"; + public static final String EMPTY_RANGE_BRACES = "{}"; public static final String DATE_SEPARATOR = ","; public static final char INCLUDED_START_CHAR = '['; @@ -37,12 +42,12 @@ public List> toEpochDayRangeList(String multiDateRange) { @Override public List toEpochDayRange(String daterange) { - if (daterange == null) { + if (daterange == null || daterange.equals(POSTGRES_NULL_RANGE)) { return Collections.emptyList(); } String[] dates = daterange.split(DATE_SEPARATOR); - Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start and end."); + Preconditions.checkArgument(dates.length == 2, "Dateranges must have a start and end. 
Input was: %s".formatted(daterange)); // the dateranges have always an included start date marked by a [ String startDateExpression = dates[0]; diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java index 074716073d..a7e6498751 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/ResultSetProcessor.java @@ -9,6 +9,8 @@ public interface ResultSetProcessor { char UNIT_SEPARATOR = (char) 31; // https://www.ascii-code.com/character/%E2%90%9F + SqlCDateSetParser getCDateSetParser(); + String getString(ResultSet resultSet, int columnIndex) throws SQLException; Integer getInteger(ResultSet resultSet, int columnIndex) throws SQLException; From edd2f7335345d6fb4939c55a47ba454e67d2667c Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 6 Nov 2024 12:48:03 +0100 Subject: [PATCH 24/31] Fix --- .../mode/local/UpdateMatchingStatsSqlJob.java | 86 +++++++++---------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 06d1ba261a..1455eb389e 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -8,9 +8,11 @@ import static org.jooq.impl.DSL.min; import static org.jooq.impl.DSL.name; import static org.jooq.impl.DSL.noCondition; +import static org.jooq.impl.DSL.noField; import static org.jooq.impl.DSL.table; import java.sql.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -51,13 +53,13 @@ import com.google.common.util.concurrent.MoreExecutors; import lombok.extern.slf4j.Slf4j; 
import org.apache.commons.lang3.time.StopWatch; -import org.jooq.AggregateFunction; import org.jooq.Condition; import org.jooq.DSLContext; import org.jooq.Field; import org.jooq.Name; import org.jooq.Record; import org.jooq.Select; +import org.jooq.SelectJoinStep; import org.jooq.Table; @Slf4j @@ -67,8 +69,6 @@ public class UpdateMatchingStatsSqlJob extends Job { private static final Name EVENTS = name("events"); private static final Name ENTITIES = name("entities"); private static final Name DATES = name("dates"); - private static final Name MIN_DATE = name("min_date"); - private static final Name MAX_DATE = name("max_date"); private final DatabaseConfig databaseConfig; private final SqlExecutionService executionService; @@ -156,31 +156,28 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .reduce(Select::unionAll) .orElseThrow(IllegalStateException::new); - // select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors - final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); - final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); - final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final AggregateFunction minDate = min(functionProvider.least(allStarts)); - final AggregateFunction maxDate = max(functionProvider.greatest((allEnds))); - // all connectors need the same columns originating from the concept definition - they might have different names in the respective connector tables, // but as we aliased them already, we can just use the unified aliases in the final query final List> relevantColumnsAliased = relevantColumns.get(treeConcept.getConnectors().get(0)).stream() .map(field -> field(field.getUnqualifiedName())) .collect(Collectors.toList()); - final Select query = dslContext.select(relevantColumnsAliased) - .select( - 
count(asterisk()).as(EVENTS), - countDistinct(field(ENTITIES)).as(ENTITIES), - minDate.as(MIN_DATE), - maxDate.as(MAX_DATE) - ) - .from(unioned) - .groupBy(relevantColumnsAliased); + // if there is no validity date at all, we select no field + final Field validityDateExpression = validityDateMap.isEmpty() ? noField() : toValidityDateExpression(validityDateMap); + + final SelectJoinStep query = dslContext.select(relevantColumnsAliased) + .select( + count(asterisk()).as(EVENTS), + countDistinct(field(ENTITIES)).as(ENTITIES), + validityDateExpression.as(DATES) + ) + .from(unioned); + + // not all dialects accept an empty group by () clause + final Select finalQuery = relevantColumnsAliased.isEmpty() ? query : query.groupBy(relevantColumnsAliased); final ConceptTreeCache treeCache = new ConceptTreeCache(treeConcept); - executionService.fetchStream(query).forEach(record -> mapRecordToConceptElements(treeConcept, record, relevantColumnsAliased, treeCache)); + executionService.fetchStream(finalQuery).forEach(record -> mapRecordToConceptElements(treeConcept, record, relevantColumnsAliased, treeCache)); } /** @@ -225,11 +222,15 @@ private Set collectRelevantColumns(final Connector connector, final Conc } private Map> createColumnDateRanges(final TreeConcept treeConcept) { + final Map> map = new HashMap<>(); final AtomicInteger counter = new AtomicInteger(0); - return treeConcept.getConnectors().stream().collect(Collectors.toMap( - Function.identity(), - connector -> createColumnDateRanges(connector, counter) - )); + for (final ConceptTreeConnector connector : treeConcept.getConnectors()) { + if (connector.getValidityDates().isEmpty()) { + continue; + } + map.put(connector, createColumnDateRanges(connector, counter)); + } + return map; } private List createColumnDateRanges(final Connector connector, final AtomicInteger counter) { @@ -274,6 +275,17 @@ private Condition toJooqCondition(final Connector connector, final Optional toValidityDateExpression(final Map> 
validityDateMap) { + final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); + final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); + final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); + final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + return functionProvider.daterangeStringExpression(minAndMax); + } + private void mapRecordToConceptElements( final TreeConcept treeConcept, final Record record, @@ -313,30 +325,16 @@ private void mapRecordToConceptElements( }); } - private MatchingStats.Entry toMatchingStatsEntry(final Record record) { + private MatchingStats.Entry toMatchingStatsEntry(Record record) { final long events = record.get(EVENTS, Integer.class).longValue(); final long entities = record.get(ENTITIES, Integer.class).longValue(); - - final int minDate = record.get(MIN_DATE, Date.class) == null - ? CDateRange.NEGATIVE_INFINITY - : toValidInt(record.get(MIN_DATE, Date.class)); - - final int maxDate = record.get(MAX_DATE, Date.class) == null - ? 
CDateRange.POSITIVE_INFINITY - : toValidInt(record.get(MAX_DATE, Date.class)); - - return new MatchingStats.Entry(events, entities, minDate, maxDate); + final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); + return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), dateSpan.getMaxValue()); } - private static int toValidInt(final Date date) { - final long epochDay = date.toLocalDate().toEpochDay(); - if (epochDay < Integer.MIN_VALUE) { - return Integer.MIN_VALUE; - } - else if (epochDay > Integer.MAX_VALUE) { - return Integer.MAX_VALUE; - } - return Math.toIntExact(epochDay); + private CDateRange toDateRange(final String validityDateExpression) { + final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); + return !dateRange.isEmpty() ? CDateRange.fromList(dateRange) : CDateRange.all(); } private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { From 944fc14ac4c9454402f157b41983258907311c55 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:07:00 +0100 Subject: [PATCH 25/31] cleanup CTCondition.getAuxillaryColumns additionally minor simplification in UpdateMatchingStatsSqlJob --- .../mode/local/UpdateMatchingStatsSqlJob.java | 178 +++++++++--------- .../concepts/conditions/AndCondition.java | 9 +- .../concepts/conditions/CTCondition.java | 3 +- .../conditions/ColumnEqualCondition.java | 5 +- .../concepts/conditions/EqualCondition.java | 8 +- .../concepts/conditions/GroovyCondition.java | 5 +- .../conditions/IsPresentCondition.java | 3 +- .../concepts/conditions/NotCondition.java | 7 +- .../concepts/conditions/OrCondition.java | 9 +- .../concepts/conditions/PrefixCondition.java | 8 +- .../conditions/PrefixRangeCondition.java | 8 +- 11 files changed, 117 insertions(+), 126 deletions(-) diff --git 
a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 1455eb389e..6f5d2d1897 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -1,22 +1,13 @@ package com.bakdata.conquery.mode.local; -import static org.jooq.impl.DSL.asterisk; -import static org.jooq.impl.DSL.count; -import static org.jooq.impl.DSL.countDistinct; -import static org.jooq.impl.DSL.field; -import static org.jooq.impl.DSL.max; -import static org.jooq.impl.DSL.min; -import static org.jooq.impl.DSL.name; -import static org.jooq.impl.DSL.noCondition; -import static org.jooq.impl.DSL.noField; -import static org.jooq.impl.DSL.table; +import static org.jooq.impl.DSL.*; import java.sql.Date; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; @@ -93,6 +84,22 @@ public UpdateMatchingStatsSqlJob( this.executors = MoreExecutors.listeningDecorator(executors); } + private static boolean isTreeConcept(final Concept concept) { + if (!(concept instanceof TreeConcept)) { + log.error("Collecting MatchingStats is currently only supported for TreeConcepts."); + return false; + } + return true; + } + + private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { + if (mostSpecificChild.getMatchingStats() == null) { + ((ConceptElement) mostSpecificChild).setMatchingStats(new MatchingStats()); + } + + mostSpecificChild.getMatchingStats().putEntry(columnKey, entry); + } + @Override public String getLabel() { return "Calculating Matching Stats for 
%s.".formatted(executionService); @@ -137,14 +144,6 @@ public void cancel() { super.cancel(); } - private static boolean isTreeConcept(final Concept concept) { - if (!(concept instanceof TreeConcept)) { - log.error("Collecting MatchingStats is currently only supported for TreeConcepts."); - return false; - } - return true; - } - public void calculateMatchingStats(final TreeConcept treeConcept) { final Map>> relevantColumns = collectRelevantColumns(treeConcept); @@ -152,7 +151,7 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { // union of all connectors of the concept final Select unioned = treeConcept.getConnectors().stream() - .map(connector -> this.createConnectorQuery(connector, relevantColumns, validityDateMap)) + .map(connector -> createConnectorQuery(connector, relevantColumns, validityDateMap)) .reduce(Select::unionAll) .orElseThrow(IllegalStateException::new); @@ -177,7 +176,8 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { final Select finalQuery = relevantColumnsAliased.isEmpty() ? query : query.groupBy(relevantColumnsAliased); final ConceptTreeCache treeCache = new ConceptTreeCache(treeConcept); - executionService.fetchStream(finalQuery).forEach(record -> mapRecordToConceptElements(treeConcept, record, relevantColumnsAliased, treeCache)); + executionService.fetchStream(finalQuery) + .forEach(record -> mapRecordToConceptElements(treeConcept, record, treeCache)); } /** @@ -185,37 +185,41 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { * {@link CTCondition} which is part of any child of a concept, or it's a concept's connector column. 
*/ private Map>> collectRelevantColumns(final TreeConcept treeConcept) { - return treeConcept.getConnectors().stream().collect(Collectors.toMap( - Function.identity(), - connector -> collectRelevantColumns(connector, treeConcept.getChildren()) - .stream() - .map(column -> { - final Field field = field(name(column)); - // connector columns are unioned, thus they need the same alias - if (connector.getColumn() != null && connector.getColumn().resolve().getName().equals(column)) { - return field.as(CONNECTOR_COLUMN); - } - // a condition which does not operate on the connector column MUST have the same name in all connector's tables - return field; - }) - .collect(Collectors.toSet()) - )); + return treeConcept.getConnectors().stream() + .collect(Collectors.toMap( + Function.identity(), + connector -> collectRelevantColumns(connector, treeConcept) + )); + } + + private Set> collectRelevantColumns(final Connector connector, TreeConcept concept) { + final Set> out = new HashSet<>(); + + if (connector.getColumn() != null) { + out.add(field(name(connector.getColumn().resolve().getName())).as(CONNECTOR_COLUMN)); + } + + for (String name : collectRelevantColumns(concept.getChildren())) { + out.add(field(name(name))); + } + + return out; } - private Set collectRelevantColumns(final Connector connector, final List children) { - return children.stream().flatMap(child -> collectRelevantColumns(connector, child).stream()).collect(Collectors.toSet()); + private Set collectRelevantColumns(final List children) { + return children.stream().flatMap(child -> collectRelevantColumns(child).stream()).collect(Collectors.toSet()); } - private Set collectRelevantColumns(final Connector connector, final ConceptTreeChild child) { + private Set collectRelevantColumns(final ConceptTreeChild child) { final Set childColumns = new HashSet<>(); // Recursively collect columns from the current child's children, if they exist if (!child.getChildren().isEmpty()) { - final Set childrenColumns = 
collectRelevantColumns(connector, child.getChildren()); + final Set childrenColumns = collectRelevantColumns(child.getChildren()); childColumns.addAll(childrenColumns); } // Add columns from the child's condition, if it exists if (child.getCondition() != null) { - final Set conditionColumns = child.getCondition().getColumns(connector); + final Set conditionColumns = child.getCondition().getAuxillaryColumns(); childColumns.addAll(conditionColumns); } return childColumns; @@ -249,17 +253,27 @@ private Select createConnectorQuery( final Set> connectorColumns = relevantColumns.get(connector); final Field primaryKey = TablePrimaryColumnUtil.findPrimaryColumn(connector.getResolvedTable(), databaseConfig).as(ENTITIES); - // we have to select all possible validity dates of all connectors because we have to union multiple connectors - final List> validityDates = - validityDateMap.entrySet().stream() - .flatMap(entry -> entry.getValue().stream().map(columnDateRange -> entry.getKey() == connector - ? columnDateRange - : functionProvider.nulled(columnDateRange)) - .flatMap(columnDateRange -> columnDateRange.toFields().stream())) - .toList(); + final List> validityDates = new ArrayList<>(); + + for (Map.Entry> entry : validityDateMap.entrySet()) { + for (ColumnDateRange columnDateRange : entry.getValue()) { + + // we have to select all possible validity dates of all connectors because we have to union multiple connectors + ColumnDateRange dateRange = columnDateRange; + + // Therefore we usually select null + if (entry.getKey() != connector) { + dateRange = functionProvider.nulled(columnDateRange); + } + + validityDates.addAll(dateRange.toFields()); + } + } // connector might have a condition - final Condition connectorCondition = toJooqCondition(connector, Optional.ofNullable(connector.getCondition())); + final Condition connectorCondition = connector.getCondition() == null + ? 
noCondition() + : toJooqCondition(connector, connector.getCondition()); return dslContext.select(primaryKey) .select(connectorColumns) @@ -268,11 +282,9 @@ private Select createConnectorQuery( .where(connectorCondition); } - private Condition toJooqCondition(final Connector connector, final Optional childCondition) { + private Condition toJooqCondition(final Connector connector, CTCondition childCondition) { final CTConditionContext context = CTConditionContext.create(connector, functionProvider); - return childCondition.or(() -> Optional.ofNullable(connector.getCondition())) - .map(condition -> condition.convertToSqlCondition(context).condition()) - .orElse(noCondition()); + return childCondition.convertToSqlCondition(context).condition(); } /** @@ -286,12 +298,7 @@ private Field toValidityDateExpression(final Map> relevantColumns, - final ConceptTreeCache treeCache - ) { + private void mapRecordToConceptElements(final TreeConcept treeConcept, final Record record, final ConceptTreeCache treeCache) { final CalculatedValue> rowMap = new CalculatedValue<>(record::intoMap); final MatchingStats.Entry entry = toMatchingStatsEntry(record); @@ -300,53 +307,44 @@ private void mapRecordToConceptElements( return; } - relevantColumns.stream().map(field -> record.get(field, String.class)).forEach(relevantColumnValue -> { - try { - final ConceptTreeChild mostSpecificChild = treeCache.findMostSpecificChild(relevantColumnValue, rowMap); + try { + final String columnValue = record.get(CONNECTOR_COLUMN, String.class); - // database value did not match any node of the concept - if (mostSpecificChild == null) { - return; - } - - // add stats for most specific child - addEntryToConceptElement(mostSpecificChild, relevantColumnValue, entry); + final ConceptTreeChild mostSpecificChild = treeCache.findMostSpecificChild(columnValue, rowMap); - // add child stats to all parents till concept root - ConceptTreeNode current = mostSpecificChild.getParent(); - while (current != null) { - 
addEntryToConceptElement(current, relevantColumnValue, entry); - current = current.getParent(); - } + // database value did not match any node of the concept + if (mostSpecificChild == null) { + return; } - catch (ConceptConfigurationException e) { - throw new RuntimeException(e); + + // add child stats to all parents till concept root + ConceptTreeNode current = mostSpecificChild; + while (current != null) { + addEntryToConceptElement(current, columnValue, entry); + current = current.getParent(); } - }); + } + catch (ConceptConfigurationException e) { + throw new RuntimeException(e); + } } private MatchingStats.Entry toMatchingStatsEntry(Record record) { final long events = record.get(EVENTS, Integer.class).longValue(); final long entities = record.get(ENTITIES, Integer.class).longValue(); final CDateRange dateSpan = toDateRange(record.get(DATES, String.class)); + return new MatchingStats.Entry(events, entities, dateSpan.getMinValue(), dateSpan.getMaxValue()); } private CDateRange toDateRange(final String validityDateExpression) { final List dateRange = executionService.getResultSetProcessor().getCDateSetParser().toEpochDayRange(validityDateExpression); - return !dateRange.isEmpty() ? 
CDateRange.fromList(dateRange) : CDateRange.all(); - } - private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { - final MatchingStats childMatchingStats; - if (mostSpecificChild.getMatchingStats() == null) { - childMatchingStats = new MatchingStats(); - ((ConceptElement) mostSpecificChild).setMatchingStats(childMatchingStats); - } - else { - childMatchingStats = mostSpecificChild.getMatchingStats(); + if (dateRange.isEmpty()) { + return CDateRange.all(); } - childMatchingStats.putEntry(columnKey, entry); + + return CDateRange.fromList(dateRange); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java index 18d56960fe..ed2f58edc1 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/AndCondition.java @@ -4,16 +4,15 @@ import java.util.List; import java.util.Map; import java.util.Set; +import jakarta.validation.Valid; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.Valid; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; @@ -57,10 +56,10 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector 
connector) { + public Set getAuxillaryColumns() { final Set columns = new HashSet<>(); for (CTCondition ctCondition : conditions) { - columns.addAll(ctCondition.getColumns(connector)); + columns.addAll(ctCondition.getAuxillaryColumns()); } return columns; } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java index 2098e2995f..c77856c1b3 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java @@ -4,7 +4,6 @@ import java.util.Set; import com.bakdata.conquery.io.cps.CPSBase; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; @@ -26,6 +25,6 @@ default void init(ConceptTreeNode node) throws ConceptConfigurationException { WhereCondition convertToSqlCondition(CTConditionContext context); - Set getColumns(Connector connector); + Set getAuxillaryColumns(); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java index 438dc1a046..a5adf79bd4 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/ColumnEqualCondition.java @@ -2,16 +2,15 @@ import java.util.Map; import java.util.Set; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import 
com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.CollectionsUtil; import com.fasterxml.jackson.annotation.JsonCreator; -import jakarta.validation.constraints.NotEmpty; import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Getter; @@ -56,7 +55,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { + public Set getAuxillaryColumns() { return Set.of(column); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java index 3942231400..de6ff96d60 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/EqualCondition.java @@ -1,17 +1,17 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; +import java.util.Collections; import java.util.Map; import java.util.Set; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.MultiSelectCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; import com.bakdata.conquery.util.CollectionsUtil; import com.fasterxml.jackson.annotation.JsonCreator; -import jakarta.validation.constraints.NotEmpty; 
import lombok.AllArgsConstructor; import lombok.Getter; import lombok.Setter; @@ -47,7 +47,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { - return Set.of(connector.getColumn().getColumn()); + public Set getAuxillaryColumns() { + return Collections.emptySet(); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java index 2d314eb032..910a932c1b 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/GroovyCondition.java @@ -4,10 +4,10 @@ import java.util.Map; import java.util.Set; import java.util.stream.Stream; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.common.Range; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; @@ -16,7 +16,6 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import groovy.lang.GroovyShell; import groovy.lang.Script; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; @@ -83,7 +82,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { + public Set getAuxillaryColumns() { return Set.of(); } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java 
b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java index bef9540555..dd8848a0b6 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/IsPresentCondition.java @@ -4,7 +4,6 @@ import java.util.Set; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; @@ -39,7 +38,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { + public Set getAuxillaryColumns() { return Set.of(column); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java index 869be30283..01dcc3b6ab 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/NotCondition.java @@ -2,15 +2,14 @@ import java.util.Map; import java.util.Set; +import jakarta.validation.Valid; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.Valid; import 
lombok.Getter; import lombok.Setter; @@ -42,7 +41,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { - return condition.getColumns(connector); + public Set getAuxillaryColumns() { + return condition.getAuxillaryColumns(); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java index 270193e2e2..ef9bce878a 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/OrCondition.java @@ -4,16 +4,15 @@ import java.util.List; import java.util.Map; import java.util.Set; +import jakarta.validation.Valid; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeNode; import com.bakdata.conquery.models.exceptions.ConceptConfigurationException; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.Valid; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; @@ -57,10 +56,10 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { + public Set getAuxillaryColumns() { final Set columns = new HashSet<>(); for (CTCondition ctCondition : conditions) { - columns.addAll(ctCondition.getColumns(connector)); + columns.addAll(ctCondition.getAuxillaryColumns()); } return columns; } diff --git 
a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java index 7ad362505f..5f53b98730 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java +++ b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixCondition.java @@ -1,18 +1,18 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; import java.util.Arrays; +import java.util.Collections; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.sql.conversion.model.filter.WhereConditionWrapper; import com.bakdata.conquery.util.CalculatedValue; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import lombok.ToString; @@ -51,7 +51,7 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { - return Set.of(connector.getColumn().getColumn()); + public Set getAuxillaryColumns() { + return Collections.emptySet(); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java index 75143b6436..e24f094813 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java +++ 
b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/PrefixRangeCondition.java @@ -1,10 +1,11 @@ package com.bakdata.conquery.models.datasets.concepts.conditions; +import java.util.Collections; import java.util.Map; import java.util.Set; +import jakarta.validation.constraints.NotEmpty; import com.bakdata.conquery.io.cps.CPSType; -import com.bakdata.conquery.models.datasets.concepts.Connector; import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.dialect.SqlFunctionProvider; import com.bakdata.conquery.sql.conversion.model.filter.ConditionType; @@ -13,7 +14,6 @@ import com.bakdata.conquery.util.CalculatedValue; import com.fasterxml.jackson.annotation.JsonIgnore; import io.dropwizard.validation.ValidationMethod; -import jakarta.validation.constraints.NotEmpty; import lombok.Getter; import lombok.Setter; import org.jooq.Condition; @@ -64,8 +64,8 @@ public WhereCondition convertToSqlCondition(CTConditionContext context) { } @Override - public Set getColumns(Connector connector) { - return Set.of(connector.getColumn().getColumn()); + public Set getAuxillaryColumns() { + return Collections.emptySet(); } private String buildSqlRegexPattern(SqlFunctionProvider functionProvider) { From 6f4cd6a6613f7ff4ae851033476862b7ffc0d036 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:33:50 +0100 Subject: [PATCH 26/31] ignore auxillary columns --- .../models/datasets/concepts/conditions/CTCondition.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java index c77856c1b3..f93e9e63ea 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java +++ 
b/backend/src/main/java/com/bakdata/conquery/models/datasets/concepts/conditions/CTCondition.java @@ -9,6 +9,7 @@ import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext; import com.bakdata.conquery.sql.conversion.model.filter.WhereCondition; import com.bakdata.conquery.util.CalculatedValue; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonTypeInfo; /** @@ -25,6 +26,7 @@ default void init(ConceptTreeNode node) throws ConceptConfigurationException { WhereCondition convertToSqlCondition(CTConditionContext context); + @JsonIgnore Set getAuxillaryColumns(); } From 98f5902a0c067d771f9e828d96b35af8fdd62ae4 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Mon, 11 Nov 2024 11:05:16 +0100 Subject: [PATCH 27/31] dont min/max when ungrouped --- .../mode/local/UpdateMatchingStatsSqlJob.java | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 6f5d2d1897..e344019e53 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -162,7 +162,8 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .collect(Collectors.toList()); // if there is no validity date at all, we select no field - final Field validityDateExpression = validityDateMap.isEmpty() ? 
noField() : toValidityDateExpression(validityDateMap); + + final Field validityDateExpression = toValidityDateExpression(validityDateMap, !relevantColumns.isEmpty()); final SelectJoinStep query = dslContext.select(relevantColumnsAliased) .select( @@ -290,11 +291,24 @@ private Condition toJooqCondition(final Connector connector, CTCondition childCo /** * Select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors. */ - private Field toValidityDateExpression(final Map> validityDateMap) { + private Field toValidityDateExpression(final Map> validityDateMap, boolean grouped) { + if (validityDateMap.isEmpty()){ + return noField(String.class); + } + final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + + final ColumnDateRange minAndMax; + + if (grouped){ + minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + } + else { + minAndMax = ColumnDateRange.of(functionProvider.least(allStarts), functionProvider.greatest(allEnds)); + } + return functionProvider.daterangeStringExpression(minAndMax); } From 9ab227e22f4495c27d92ad7de22bf18d791147f0 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Tue, 12 Nov 2024 17:15:21 +0100 Subject: [PATCH 28/31] Fix least/greatest usage when there is only 1 validity date --- .../mode/local/UpdateMatchingStatsSqlJob.java | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java 
b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index e344019e53..7261ee21d9 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -1,6 +1,13 @@ package com.bakdata.conquery.mode.local; -import static org.jooq.impl.DSL.*; +import static org.jooq.impl.DSL.asterisk; +import static org.jooq.impl.DSL.count; +import static org.jooq.impl.DSL.countDistinct; +import static org.jooq.impl.DSL.field; +import static org.jooq.impl.DSL.name; +import static org.jooq.impl.DSL.noCondition; +import static org.jooq.impl.DSL.noField; +import static org.jooq.impl.DSL.table; import java.sql.Date; import java.util.ArrayList; @@ -52,6 +59,7 @@ import org.jooq.Select; import org.jooq.SelectJoinStep; import org.jooq.Table; +import org.jooq.impl.DSL; @Slf4j public class UpdateMatchingStatsSqlJob extends Job { @@ -162,7 +170,6 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .collect(Collectors.toList()); // if there is no validity date at all, we select no field - final Field validityDateExpression = toValidityDateExpression(validityDateMap, !relevantColumns.isEmpty()); final SelectJoinStep query = dslContext.select(relevantColumnsAliased) @@ -193,7 +200,7 @@ private Map>> collectRelevantColumns(final TreeConcept t )); } - private Set> collectRelevantColumns(final Connector connector, TreeConcept concept) { + private Set> collectRelevantColumns(final Connector connector, final TreeConcept concept) { final Set> out = new HashSet<>(); if (connector.getColumn() != null) { @@ -283,7 +290,7 @@ private Select createConnectorQuery( .where(connectorCondition); } - private Condition toJooqCondition(final Connector connector, CTCondition childCondition) { + private Condition toJooqCondition(final Connector connector, final CTCondition childCondition) { final CTConditionContext context = 
CTConditionContext.create(connector, functionProvider); return childCondition.convertToSqlCondition(context).condition(); } @@ -291,8 +298,9 @@ private Condition toJooqCondition(final Connector connector, CTCondition childCo /** * Select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors. */ - private Field toValidityDateExpression(final Map> validityDateMap, boolean grouped) { - if (validityDateMap.isEmpty()){ + private Field toValidityDateExpression(final Map> validityDateMap, final boolean grouped) { + + if (validityDateMap.isEmpty()) { return noField(String.class); } @@ -300,18 +308,28 @@ private Field toValidityDateExpression(final Map> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax; - - if (grouped){ - minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); - } - else { - minAndMax = ColumnDateRange.of(functionProvider.least(allStarts), functionProvider.greatest(allEnds)); - } + final ColumnDateRange minAndMax = ColumnDateRange.of( + toAggregatedDateField(allStarts, functionProvider::least, DSL::min, grouped), + toAggregatedDateField(allEnds, functionProvider::greatest, DSL::max, grouped) + ); return functionProvider.daterangeStringExpression(minAndMax); } + private Field toAggregatedDateField( + final List> dates, + final Function>, Field> multiFieldFunction, + final Function, Field> aggregator, + final boolean grouped + ) { + // multi field function like LEAST() or GREATEST() is only applied if there are multiple fields + final Field field = dates.size() == 1 + ? dates.get(0) + : multiFieldFunction.apply(dates); + + return grouped ? 
aggregator.apply(field) : field; + } + private void mapRecordToConceptElements(final TreeConcept treeConcept, final Record record, final ConceptTreeCache treeCache) { final CalculatedValue> rowMap = new CalculatedValue<>(record::intoMap); final MatchingStats.Entry entry = toMatchingStatsEntry(record); From e794f679e8e2448f85efbb3025bf97fec29943b4 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Tue, 12 Nov 2024 17:19:52 +0100 Subject: [PATCH 29/31] Revert "Fix least/greatest usage when there is only 1 validity date" This reverts commit 9ab227e22f4495c27d92ad7de22bf18d791147f0. --- .../mode/local/UpdateMatchingStatsSqlJob.java | 46 ++++++------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 7261ee21d9..e344019e53 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -1,13 +1,6 @@ package com.bakdata.conquery.mode.local; -import static org.jooq.impl.DSL.asterisk; -import static org.jooq.impl.DSL.count; -import static org.jooq.impl.DSL.countDistinct; -import static org.jooq.impl.DSL.field; -import static org.jooq.impl.DSL.name; -import static org.jooq.impl.DSL.noCondition; -import static org.jooq.impl.DSL.noField; -import static org.jooq.impl.DSL.table; +import static org.jooq.impl.DSL.*; import java.sql.Date; import java.util.ArrayList; @@ -59,7 +52,6 @@ import org.jooq.Select; import org.jooq.SelectJoinStep; import org.jooq.Table; -import org.jooq.impl.DSL; @Slf4j public class UpdateMatchingStatsSqlJob extends Job { @@ -170,6 +162,7 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .collect(Collectors.toList()); // if there is no validity date at all, we select no field + final Field validityDateExpression = 
toValidityDateExpression(validityDateMap, !relevantColumns.isEmpty()); final SelectJoinStep query = dslContext.select(relevantColumnsAliased) @@ -200,7 +193,7 @@ private Map>> collectRelevantColumns(final TreeConcept t )); } - private Set> collectRelevantColumns(final Connector connector, final TreeConcept concept) { + private Set> collectRelevantColumns(final Connector connector, TreeConcept concept) { final Set> out = new HashSet<>(); if (connector.getColumn() != null) { @@ -290,7 +283,7 @@ private Select createConnectorQuery( .where(connectorCondition); } - private Condition toJooqCondition(final Connector connector, final CTCondition childCondition) { + private Condition toJooqCondition(final Connector connector, CTCondition childCondition) { final CTConditionContext context = CTConditionContext.create(connector, functionProvider); return childCondition.convertToSqlCondition(context).condition(); } @@ -298,9 +291,8 @@ private Condition toJooqCondition(final Connector connector, final CTCondition c /** * Select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors. 
*/ - private Field toValidityDateExpression(final Map> validityDateMap, final boolean grouped) { - - if (validityDateMap.isEmpty()) { + private Field toValidityDateExpression(final Map> validityDateMap, boolean grouped) { + if (validityDateMap.isEmpty()){ return noField(String.class); } @@ -308,26 +300,16 @@ private Field toValidityDateExpression(final Map> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax = ColumnDateRange.of( - toAggregatedDateField(allStarts, functionProvider::least, DSL::min, grouped), - toAggregatedDateField(allEnds, functionProvider::greatest, DSL::max, grouped) - ); - - return functionProvider.daterangeStringExpression(minAndMax); - } + final ColumnDateRange minAndMax; - private Field toAggregatedDateField( - final List> dates, - final Function>, Field> multiFieldFunction, - final Function, Field> aggregator, - final boolean grouped - ) { - // multi field function like LEAST() or GREATEST() is only applied if there are multiple fields - final Field field = dates.size() == 1 - ? dates.get(0) - : multiFieldFunction.apply(dates); + if (grouped){ + minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + } + else { + minAndMax = ColumnDateRange.of(functionProvider.least(allStarts), functionProvider.greatest(allEnds)); + } - return grouped ? aggregator.apply(field) : field; + return functionProvider.daterangeStringExpression(minAndMax); } private void mapRecordToConceptElements(final TreeConcept treeConcept, final Record record, final ConceptTreeCache treeCache) { From 546b0e4ccd44ad1ae154cfc0ade5654653c5b206 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 13 Nov 2024 08:13:56 +0100 Subject: [PATCH 30/31] Revert "dont min/max when ungrouped" This reverts commit 98f5902a0c067d771f9e828d96b35af8fdd62ae4. 
--- .../mode/local/UpdateMatchingStatsSqlJob.java | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index e344019e53..6f5d2d1897 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -162,8 +162,7 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .collect(Collectors.toList()); // if there is no validity date at all, we select no field - - final Field validityDateExpression = toValidityDateExpression(validityDateMap, !relevantColumns.isEmpty()); + final Field validityDateExpression = validityDateMap.isEmpty() ? noField() : toValidityDateExpression(validityDateMap); final SelectJoinStep query = dslContext.select(relevantColumnsAliased) .select( @@ -291,24 +290,11 @@ private Condition toJooqCondition(final Connector connector, CTCondition childCo /** * Select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors. 
*/ - private Field toValidityDateExpression(final Map> validityDateMap, boolean grouped) { - if (validityDateMap.isEmpty()){ - return noField(String.class); - } - + private Field toValidityDateExpression(final Map> validityDateMap) { final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - - final ColumnDateRange minAndMax; - - if (grouped){ - minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); - } - else { - minAndMax = ColumnDateRange.of(functionProvider.least(allStarts), functionProvider.greatest(allEnds)); - } - + final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); return functionProvider.daterangeStringExpression(minAndMax); } From 4c724a333b18c7d885982198f356c0bd17f3c9e6 Mon Sep 17 00:00:00 2001 From: Jonas Arnhold Date: Wed, 13 Nov 2024 09:50:38 +0100 Subject: [PATCH 31/31] Only use least/greatest for multiple validity dates --- .../mode/local/UpdateMatchingStatsSqlJob.java | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java index 6f5d2d1897..959d69f980 100644 --- a/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java +++ b/backend/src/main/java/com/bakdata/conquery/mode/local/UpdateMatchingStatsSqlJob.java @@ -1,6 +1,15 @@ package com.bakdata.conquery.mode.local; -import static org.jooq.impl.DSL.*; +import static org.jooq.impl.DSL.asterisk; +import static org.jooq.impl.DSL.count; +import static org.jooq.impl.DSL.countDistinct; +import 
static org.jooq.impl.DSL.field; +import static org.jooq.impl.DSL.max; +import static org.jooq.impl.DSL.min; +import static org.jooq.impl.DSL.name; +import static org.jooq.impl.DSL.noCondition; +import static org.jooq.impl.DSL.noField; +import static org.jooq.impl.DSL.table; import java.sql.Date; import java.util.ArrayList; @@ -84,27 +93,6 @@ public UpdateMatchingStatsSqlJob( this.executors = MoreExecutors.listeningDecorator(executors); } - private static boolean isTreeConcept(final Concept concept) { - if (!(concept instanceof TreeConcept)) { - log.error("Collecting MatchingStats is currently only supported for TreeConcepts."); - return false; - } - return true; - } - - private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { - if (mostSpecificChild.getMatchingStats() == null) { - ((ConceptElement) mostSpecificChild).setMatchingStats(new MatchingStats()); - } - - mostSpecificChild.getMatchingStats().putEntry(columnKey, entry); - } - - @Override - public String getLabel() { - return "Calculating Matching Stats for %s.".formatted(executionService); - } - @Override public void execute() throws Exception { @@ -144,7 +132,28 @@ public void cancel() { super.cancel(); } - public void calculateMatchingStats(final TreeConcept treeConcept) { + @Override + public String getLabel() { + return "Calculating Matching Stats for %s.".formatted(executionService); + } + + private static boolean isTreeConcept(final Concept concept) { + if (!(concept instanceof TreeConcept)) { + log.error("Collecting MatchingStats is currently only supported for TreeConcepts."); + return false; + } + return true; + } + + private static void addEntryToConceptElement(final ConceptTreeNode mostSpecificChild, final String columnKey, final MatchingStats.Entry entry) { + if (mostSpecificChild.getMatchingStats() == null) { + ((ConceptElement) mostSpecificChild).setMatchingStats(new MatchingStats()); + } + + 
mostSpecificChild.getMatchingStats().putEntry(columnKey, entry); + } + + private void calculateMatchingStats(final TreeConcept treeConcept) { final Map>> relevantColumns = collectRelevantColumns(treeConcept); final Map> validityDateMap = createColumnDateRanges(treeConcept); @@ -161,8 +170,8 @@ public void calculateMatchingStats(final TreeConcept treeConcept) { .map(field -> field(field.getUnqualifiedName())) .collect(Collectors.toList()); - // if there is no validity date at all, we select no field - final Field validityDateExpression = validityDateMap.isEmpty() ? noField() : toValidityDateExpression(validityDateMap); + // if there is no validity date at all, no field is selected + final Field validityDateExpression = toValidityDateExpression(validityDateMap); final SelectJoinStep query = dslContext.select(relevantColumnsAliased) .select( @@ -291,10 +300,19 @@ private Condition toJooqCondition(final Connector connector, CTCondition childCo * Select the minimum of the least start date and the maximum of the greatest end date of all validity dates of all connectors. */ private Field toValidityDateExpression(final Map> validityDateMap) { + + if (validityDateMap.isEmpty()) { + return noField(String.class); + } + final List validityDates = validityDateMap.values().stream().flatMap(List::stream).map(functionProvider::toDualColumn).toList(); final List> allStarts = validityDates.stream().map(ColumnDateRange::getStart).toList(); final List> allEnds = validityDates.stream().map(ColumnDateRange::getEnd).toList(); - final ColumnDateRange minAndMax = ColumnDateRange.of(min(functionProvider.least(allStarts)), max(functionProvider.greatest((allEnds)))); + + final ColumnDateRange minAndMax = ColumnDateRange.of( + min(allStarts.size() > 1 ? functionProvider.least(allStarts) : allStarts.get(0)), + max(allEnds.size() > 1 ? functionProvider.greatest(allEnds) : allEnds.get(0)) + ); return functionProvider.daterangeStringExpression(minAndMax); }