From 6753bbd0e43231a908d902198eab2ff04f3a86b3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 4 Jul 2024 12:00:35 +0200
Subject: [PATCH 01/22] Update databricks-labs-lsql requirement from ~=0.4.0 to >=0.4,<0.6 (#2076)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates the requirements on [databricks-labs-lsql](https://github.com/databrickslabs/lsql) to permit the latest version.
Release notes

Sourced from databricks-labs-lsql's releases.

v0.5.0

Dependency updates:

Contributors: @JCZuurmond, @nfx, @dependabot[bot], @nkvuong

Changelog

Sourced from databricks-labs-lsql's changelog.

0.5.0

Dependency updates:

0.4.3

Dependency updates:

0.4.2

0.4.1

0.4.0

... (truncated)

Commits

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
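As a side note (not part of the patch itself), the practical effect of loosening the pin from `~=0.4.0` to `>=0.4,<0.6` can be sanity-checked with the `packaging` library. The snippet below is only an illustrative sketch of how the two specifier sets resolve against candidate versions.

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Old constraint: compatible-release operator, i.e. >=0.4.0 and <0.5.0
old = SpecifierSet("~=0.4.0")
# New constraint: widened range that also admits the 0.5.x line
new = SpecifierSet(">=0.4,<0.6")

for candidate in ("0.4.3", "0.5.0", "0.6.0"):
    version = Version(candidate)
    print(candidate, "old:", version in old, "new:", version in new)
# -> 0.4.3 old: True new: True
# -> 0.5.0 old: False new: True
# -> 0.6.0 old: False new: False
```

In short, `~=0.4.0` only admitted 0.4.x releases, while the new range also admits 0.5.x releases such as the lsql 0.5.0 listed in the release notes above, without allowing 0.6 or later.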
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d21963b57d..2b95c7d04f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ classifiers = [ ] dependencies = ["databricks-sdk>=0.27,<0.30", - "databricks-labs-lsql~=0.4.0", + "databricks-labs-lsql>=0.4,<0.6", "databricks-labs-blueprint>=0.6.0", "PyYAML>=6.0.0,<7.0.0", "sqlglot>=25.4.1,<25.5", From dc6b712bb829e8c23b5e2b6def40fa98cfada28a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:01:23 +0200 Subject: [PATCH 02/22] remove noise in whitelist (#2072) ## Changes remove noise in whitelist ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known.json | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index ea55c7aad5..530ab6577c 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1437,20 +1437,7 @@ "dbldatagen.function_builder": [], "dbldatagen.html_utils": [], "dbldatagen.nrange": [], - "dbldatagen.schema_parser": [ - { - "code": "table-migrate", - "message": "Can't migrate table_name argument in 'sparkSession.sql(f'create temporary table {table_name}{table_body}using csv')' because its value cannot be computed" - }, - { - "code": "table-migrate", - "message": "Can't migrate table_name argument in 'sparkSession.sql(f'drop table {table_name}')' because its value cannot be computed" - }, - { - "code": "table-migrate", - "message": "Can't migrate table_name argument in 'sparkSession.sql(f'select * from {table_name}')' because its value cannot be computed" - } - ], + "dbldatagen.schema_parser": [], "dbldatagen.spark_singleton": [], "dbldatagen.text_generator_plugins": [], "dbldatagen.text_generators": [], From 9ed54a42f0a7f1fe0f3522f56d289dcfa4e9777f Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:02:39 +0200 Subject: [PATCH 03/22] whitelist-spacy (#1986) ## Changes whitelist-spacy ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 879 ++++++++++++++++++ 1 file changed, 879 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 530ab6577c..60f236e455 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -21909,6 +21909,885 @@ "soupsieve.pretty": [], "soupsieve.util": [] }, + "spacy": { + "spacy": [], + "spacy.about": [], + "spacy.cli": [], + "spacy.cli._util": [], + "spacy.cli.apply": [], + "spacy.cli.assemble": [], + "spacy.cli.benchmark_speed": [], + "spacy.cli.convert": [], + "spacy.cli.debug_config": [], + "spacy.cli.debug_data": [], + "spacy.cli.debug_diff": [], + "spacy.cli.debug_model": [], + "spacy.cli.download": [], + "spacy.cli.evaluate": [], + "spacy.cli.find_function": [], + "spacy.cli.find_threshold": [], + "spacy.cli.info": [], + "spacy.cli.init_config": [], + "spacy.cli.init_pipeline": [], + "spacy.cli.package": [], + "spacy.cli.pretrain": [], + "spacy.cli.profile": [], + "spacy.cli.project": [], + "spacy.cli.project.assets": [], + 
"spacy.cli.project.clone": [], + "spacy.cli.project.document": [], + "spacy.cli.project.dvc": [], + "spacy.cli.project.pull": [], + "spacy.cli.project.push": [], + "spacy.cli.project.remote_storage": [], + "spacy.cli.project.run": [], + "spacy.cli.train": [], + "spacy.cli.validate": [], + "spacy.compat": [], + "spacy.displacy": [], + "spacy.displacy.render": [], + "spacy.displacy.templates": [], + "spacy.errors": [], + "spacy.git_info": [], + "spacy.glossary": [], + "spacy.kb": [], + "spacy.lang": [], + "spacy.lang.af": [], + "spacy.lang.af.stop_words": [], + "spacy.lang.am": [], + "spacy.lang.am.examples": [], + "spacy.lang.am.lex_attrs": [], + "spacy.lang.am.punctuation": [], + "spacy.lang.am.stop_words": [], + "spacy.lang.am.tokenizer_exceptions": [], + "spacy.lang.ar": [], + "spacy.lang.ar.examples": [], + "spacy.lang.ar.lex_attrs": [], + "spacy.lang.ar.punctuation": [], + "spacy.lang.ar.stop_words": [], + "spacy.lang.ar.tokenizer_exceptions": [], + "spacy.lang.az": [], + "spacy.lang.az.examples": [], + "spacy.lang.az.lex_attrs": [], + "spacy.lang.az.stop_words": [], + "spacy.lang.bg": [], + "spacy.lang.bg.examples": [], + "spacy.lang.bg.lex_attrs": [], + "spacy.lang.bg.stop_words": [], + "spacy.lang.bg.tokenizer_exceptions": [], + "spacy.lang.bn": [], + "spacy.lang.bn.examples": [], + "spacy.lang.bn.punctuation": [], + "spacy.lang.bn.stop_words": [], + "spacy.lang.bn.tokenizer_exceptions": [], + "spacy.lang.ca": [], + "spacy.lang.ca.examples": [], + "spacy.lang.ca.lemmatizer": [], + "spacy.lang.ca.lex_attrs": [], + "spacy.lang.ca.punctuation": [], + "spacy.lang.ca.stop_words": [], + "spacy.lang.ca.syntax_iterators": [], + "spacy.lang.ca.tokenizer_exceptions": [], + "spacy.lang.char_classes": [], + "spacy.lang.cs": [], + "spacy.lang.cs.examples": [], + "spacy.lang.cs.lex_attrs": [], + "spacy.lang.cs.stop_words": [], + "spacy.lang.da": [], + "spacy.lang.da.examples": [], + "spacy.lang.da.lex_attrs": [], + "spacy.lang.da.punctuation": [], + "spacy.lang.da.stop_words": [], + "spacy.lang.da.syntax_iterators": [], + "spacy.lang.da.tokenizer_exceptions": [], + "spacy.lang.de": [], + "spacy.lang.de.examples": [], + "spacy.lang.de.punctuation": [], + "spacy.lang.de.stop_words": [], + "spacy.lang.de.syntax_iterators": [], + "spacy.lang.de.tokenizer_exceptions": [], + "spacy.lang.dsb": [], + "spacy.lang.dsb.examples": [], + "spacy.lang.dsb.lex_attrs": [], + "spacy.lang.dsb.stop_words": [], + "spacy.lang.el": [], + "spacy.lang.el.examples": [], + "spacy.lang.el.get_pos_from_wiktionary": [], + "spacy.lang.el.lemmatizer": [], + "spacy.lang.el.lex_attrs": [], + "spacy.lang.el.punctuation": [], + "spacy.lang.el.stop_words": [], + "spacy.lang.el.syntax_iterators": [], + "spacy.lang.el.tokenizer_exceptions": [], + "spacy.lang.en": [], + "spacy.lang.en.examples": [], + "spacy.lang.en.lemmatizer": [], + "spacy.lang.en.lex_attrs": [], + "spacy.lang.en.punctuation": [], + "spacy.lang.en.stop_words": [], + "spacy.lang.en.syntax_iterators": [], + "spacy.lang.en.tokenizer_exceptions": [], + "spacy.lang.es": [], + "spacy.lang.es.examples": [], + "spacy.lang.es.lemmatizer": [], + "spacy.lang.es.lex_attrs": [], + "spacy.lang.es.punctuation": [], + "spacy.lang.es.stop_words": [], + "spacy.lang.es.syntax_iterators": [], + "spacy.lang.es.tokenizer_exceptions": [], + "spacy.lang.et": [], + "spacy.lang.et.stop_words": [], + "spacy.lang.eu": [], + "spacy.lang.eu.examples": [], + "spacy.lang.eu.lex_attrs": [], + "spacy.lang.eu.punctuation": [], + "spacy.lang.eu.stop_words": [], + "spacy.lang.fa": [], + 
"spacy.lang.fa.examples": [], + "spacy.lang.fa.generate_verbs_exc": [], + "spacy.lang.fa.lex_attrs": [], + "spacy.lang.fa.punctuation": [], + "spacy.lang.fa.stop_words": [], + "spacy.lang.fa.syntax_iterators": [], + "spacy.lang.fa.tokenizer_exceptions": [], + "spacy.lang.fi": [], + "spacy.lang.fi.examples": [], + "spacy.lang.fi.lex_attrs": [], + "spacy.lang.fi.punctuation": [], + "spacy.lang.fi.stop_words": [], + "spacy.lang.fi.syntax_iterators": [], + "spacy.lang.fi.tokenizer_exceptions": [], + "spacy.lang.fo": [], + "spacy.lang.fo.tokenizer_exceptions": [], + "spacy.lang.fr": [], + "spacy.lang.fr._tokenizer_exceptions_list": [], + "spacy.lang.fr.examples": [], + "spacy.lang.fr.lemmatizer": [], + "spacy.lang.fr.lex_attrs": [], + "spacy.lang.fr.punctuation": [], + "spacy.lang.fr.stop_words": [], + "spacy.lang.fr.syntax_iterators": [], + "spacy.lang.fr.tokenizer_exceptions": [], + "spacy.lang.ga": [], + "spacy.lang.ga.lemmatizer": [], + "spacy.lang.ga.stop_words": [], + "spacy.lang.ga.tokenizer_exceptions": [], + "spacy.lang.grc": [], + "spacy.lang.grc.examples": [], + "spacy.lang.grc.lex_attrs": [], + "spacy.lang.grc.punctuation": [], + "spacy.lang.grc.stop_words": [], + "spacy.lang.grc.tokenizer_exceptions": [], + "spacy.lang.gu": [], + "spacy.lang.gu.examples": [], + "spacy.lang.gu.stop_words": [], + "spacy.lang.he": [], + "spacy.lang.he.examples": [], + "spacy.lang.he.lex_attrs": [], + "spacy.lang.he.stop_words": [], + "spacy.lang.hi": [], + "spacy.lang.hi.examples": [], + "spacy.lang.hi.lex_attrs": [], + "spacy.lang.hi.stop_words": [], + "spacy.lang.hr": [], + "spacy.lang.hr.examples": [], + "spacy.lang.hr.stop_words": [], + "spacy.lang.hsb": [], + "spacy.lang.hsb.examples": [], + "spacy.lang.hsb.lex_attrs": [], + "spacy.lang.hsb.stop_words": [], + "spacy.lang.hsb.tokenizer_exceptions": [], + "spacy.lang.hu": [], + "spacy.lang.hu.examples": [], + "spacy.lang.hu.punctuation": [], + "spacy.lang.hu.stop_words": [], + "spacy.lang.hu.tokenizer_exceptions": [], + "spacy.lang.hy": [], + "spacy.lang.hy.examples": [], + "spacy.lang.hy.lex_attrs": [], + "spacy.lang.hy.stop_words": [], + "spacy.lang.id": [], + "spacy.lang.id._tokenizer_exceptions_list": [], + "spacy.lang.id.examples": [], + "spacy.lang.id.lex_attrs": [], + "spacy.lang.id.punctuation": [], + "spacy.lang.id.stop_words": [], + "spacy.lang.id.syntax_iterators": [], + "spacy.lang.id.tokenizer_exceptions": [], + "spacy.lang.is": [], + "spacy.lang.is.stop_words": [], + "spacy.lang.it": [], + "spacy.lang.it.examples": [], + "spacy.lang.it.lemmatizer": [], + "spacy.lang.it.punctuation": [], + "spacy.lang.it.stop_words": [], + "spacy.lang.it.syntax_iterators": [], + "spacy.lang.it.tokenizer_exceptions": [], + "spacy.lang.ja": [], + "spacy.lang.ja.examples": [], + "spacy.lang.ja.stop_words": [], + "spacy.lang.ja.syntax_iterators": [], + "spacy.lang.ja.tag_bigram_map": [], + "spacy.lang.ja.tag_map": [], + "spacy.lang.ja.tag_orth_map": [], + "spacy.lang.kn": [], + "spacy.lang.kn.examples": [], + "spacy.lang.kn.stop_words": [], + "spacy.lang.ko": [], + "spacy.lang.ko.examples": [], + "spacy.lang.ko.lex_attrs": [], + "spacy.lang.ko.punctuation": [], + "spacy.lang.ko.stop_words": [], + "spacy.lang.ko.tag_map": [], + "spacy.lang.ky": [], + "spacy.lang.ky.examples": [], + "spacy.lang.ky.lex_attrs": [], + "spacy.lang.ky.punctuation": [], + "spacy.lang.ky.stop_words": [], + "spacy.lang.ky.tokenizer_exceptions": [], + "spacy.lang.la": [], + "spacy.lang.la.examples": [], + "spacy.lang.la.lex_attrs": [], + "spacy.lang.la.stop_words": [], + 
"spacy.lang.la.syntax_iterators": [], + "spacy.lang.la.tokenizer_exceptions": [], + "spacy.lang.lb": [], + "spacy.lang.lb.examples": [], + "spacy.lang.lb.lex_attrs": [], + "spacy.lang.lb.punctuation": [], + "spacy.lang.lb.stop_words": [], + "spacy.lang.lb.tokenizer_exceptions": [], + "spacy.lang.lex_attrs": [], + "spacy.lang.lg": [], + "spacy.lang.lg.examples": [], + "spacy.lang.lg.lex_attrs": [], + "spacy.lang.lg.punctuation": [], + "spacy.lang.lg.stop_words": [], + "spacy.lang.lij": [], + "spacy.lang.lij.examples": [], + "spacy.lang.lij.punctuation": [], + "spacy.lang.lij.stop_words": [], + "spacy.lang.lij.tokenizer_exceptions": [], + "spacy.lang.lt": [], + "spacy.lang.lt.examples": [], + "spacy.lang.lt.lex_attrs": [], + "spacy.lang.lt.punctuation": [], + "spacy.lang.lt.stop_words": [], + "spacy.lang.lt.tokenizer_exceptions": [], + "spacy.lang.lv": [], + "spacy.lang.lv.stop_words": [], + "spacy.lang.mk": [], + "spacy.lang.mk.lemmatizer": [], + "spacy.lang.mk.lex_attrs": [], + "spacy.lang.mk.stop_words": [], + "spacy.lang.mk.tokenizer_exceptions": [], + "spacy.lang.ml": [], + "spacy.lang.ml.examples": [], + "spacy.lang.ml.lex_attrs": [], + "spacy.lang.ml.stop_words": [], + "spacy.lang.mr": [], + "spacy.lang.mr.stop_words": [], + "spacy.lang.ms": [], + "spacy.lang.ms._tokenizer_exceptions_list": [], + "spacy.lang.ms.examples": [], + "spacy.lang.ms.lex_attrs": [], + "spacy.lang.ms.punctuation": [], + "spacy.lang.ms.stop_words": [], + "spacy.lang.ms.syntax_iterators": [], + "spacy.lang.ms.tokenizer_exceptions": [], + "spacy.lang.nb": [], + "spacy.lang.nb.examples": [], + "spacy.lang.nb.punctuation": [], + "spacy.lang.nb.stop_words": [], + "spacy.lang.nb.syntax_iterators": [], + "spacy.lang.nb.tokenizer_exceptions": [], + "spacy.lang.ne": [], + "spacy.lang.ne.examples": [], + "spacy.lang.ne.lex_attrs": [], + "spacy.lang.ne.stop_words": [], + "spacy.lang.nl": [], + "spacy.lang.nl.examples": [], + "spacy.lang.nl.lemmatizer": [], + "spacy.lang.nl.lex_attrs": [], + "spacy.lang.nl.punctuation": [], + "spacy.lang.nl.stop_words": [], + "spacy.lang.nl.syntax_iterators": [], + "spacy.lang.nl.tokenizer_exceptions": [], + "spacy.lang.nn": [], + "spacy.lang.nn.examples": [], + "spacy.lang.nn.punctuation": [], + "spacy.lang.nn.tokenizer_exceptions": [], + "spacy.lang.norm_exceptions": [], + "spacy.lang.pl": [], + "spacy.lang.pl.examples": [], + "spacy.lang.pl.lemmatizer": [], + "spacy.lang.pl.lex_attrs": [], + "spacy.lang.pl.punctuation": [], + "spacy.lang.pl.stop_words": [], + "spacy.lang.pt": [], + "spacy.lang.pt.examples": [], + "spacy.lang.pt.lex_attrs": [], + "spacy.lang.pt.punctuation": [], + "spacy.lang.pt.stop_words": [], + "spacy.lang.pt.syntax_iterators": [], + "spacy.lang.pt.tokenizer_exceptions": [], + "spacy.lang.punctuation": [], + "spacy.lang.ro": [], + "spacy.lang.ro.examples": [], + "spacy.lang.ro.lex_attrs": [], + "spacy.lang.ro.punctuation": [], + "spacy.lang.ro.stop_words": [], + "spacy.lang.ro.tokenizer_exceptions": [], + "spacy.lang.ru": [], + "spacy.lang.ru.examples": [], + "spacy.lang.ru.lemmatizer": [], + "spacy.lang.ru.lex_attrs": [], + "spacy.lang.ru.stop_words": [], + "spacy.lang.ru.tokenizer_exceptions": [], + "spacy.lang.sa": [], + "spacy.lang.sa.examples": [], + "spacy.lang.sa.lex_attrs": [], + "spacy.lang.sa.stop_words": [], + "spacy.lang.si": [], + "spacy.lang.si.examples": [], + "spacy.lang.si.lex_attrs": [], + "spacy.lang.si.stop_words": [], + "spacy.lang.sk": [], + "spacy.lang.sk.examples": [], + "spacy.lang.sk.lex_attrs": [], + "spacy.lang.sk.stop_words": [], + 
"spacy.lang.sl": [], + "spacy.lang.sl.examples": [], + "spacy.lang.sl.lex_attrs": [], + "spacy.lang.sl.punctuation": [], + "spacy.lang.sl.stop_words": [], + "spacy.lang.sl.tokenizer_exceptions": [], + "spacy.lang.sq": [], + "spacy.lang.sq.examples": [], + "spacy.lang.sq.stop_words": [], + "spacy.lang.sr": [], + "spacy.lang.sr.examples": [], + "spacy.lang.sr.lex_attrs": [], + "spacy.lang.sr.punctuation": [], + "spacy.lang.sr.stop_words": [], + "spacy.lang.sr.tokenizer_exceptions": [], + "spacy.lang.sv": [], + "spacy.lang.sv.examples": [], + "spacy.lang.sv.lex_attrs": [], + "spacy.lang.sv.punctuation": [], + "spacy.lang.sv.stop_words": [], + "spacy.lang.sv.syntax_iterators": [], + "spacy.lang.sv.tokenizer_exceptions": [], + "spacy.lang.ta": [], + "spacy.lang.ta.examples": [], + "spacy.lang.ta.lex_attrs": [], + "spacy.lang.ta.stop_words": [], + "spacy.lang.te": [], + "spacy.lang.te.examples": [], + "spacy.lang.te.lex_attrs": [], + "spacy.lang.te.stop_words": [], + "spacy.lang.th": [], + "spacy.lang.th.lex_attrs": [], + "spacy.lang.th.stop_words": [], + "spacy.lang.th.tokenizer_exceptions": [], + "spacy.lang.ti": [], + "spacy.lang.ti.examples": [], + "spacy.lang.ti.lex_attrs": [], + "spacy.lang.ti.punctuation": [], + "spacy.lang.ti.stop_words": [], + "spacy.lang.ti.tokenizer_exceptions": [], + "spacy.lang.tl": [], + "spacy.lang.tl.lex_attrs": [], + "spacy.lang.tl.stop_words": [], + "spacy.lang.tl.tokenizer_exceptions": [], + "spacy.lang.tn": [], + "spacy.lang.tn.examples": [], + "spacy.lang.tn.lex_attrs": [], + "spacy.lang.tn.punctuation": [], + "spacy.lang.tn.stop_words": [], + "spacy.lang.tokenizer_exceptions": [], + "spacy.lang.tr": [], + "spacy.lang.tr.examples": [], + "spacy.lang.tr.lex_attrs": [], + "spacy.lang.tr.stop_words": [], + "spacy.lang.tr.syntax_iterators": [], + "spacy.lang.tr.tokenizer_exceptions": [], + "spacy.lang.tt": [], + "spacy.lang.tt.examples": [], + "spacy.lang.tt.lex_attrs": [], + "spacy.lang.tt.punctuation": [], + "spacy.lang.tt.stop_words": [], + "spacy.lang.tt.tokenizer_exceptions": [], + "spacy.lang.uk": [], + "spacy.lang.uk.examples": [], + "spacy.lang.uk.lemmatizer": [], + "spacy.lang.uk.lex_attrs": [], + "spacy.lang.uk.stop_words": [], + "spacy.lang.uk.tokenizer_exceptions": [], + "spacy.lang.ur": [], + "spacy.lang.ur.examples": [], + "spacy.lang.ur.lex_attrs": [], + "spacy.lang.ur.punctuation": [], + "spacy.lang.ur.stop_words": [], + "spacy.lang.vi": [], + "spacy.lang.vi.examples": [], + "spacy.lang.vi.lex_attrs": [], + "spacy.lang.vi.stop_words": [], + "spacy.lang.xx": [], + "spacy.lang.xx.examples": [], + "spacy.lang.yo": [], + "spacy.lang.yo.examples": [], + "spacy.lang.yo.lex_attrs": [], + "spacy.lang.yo.stop_words": [], + "spacy.lang.zh": [], + "spacy.lang.zh.examples": [], + "spacy.lang.zh.lex_attrs": [], + "spacy.lang.zh.stop_words": [], + "spacy.language": [], + "spacy.lookups": [], + "spacy.matcher": [], + "spacy.ml": [], + "spacy.ml._character_embed": [], + "spacy.ml._precomputable_affine": [], + "spacy.ml.callbacks": [], + "spacy.ml.extract_ngrams": [], + "spacy.ml.extract_spans": [], + "spacy.ml.featureextractor": [], + "spacy.ml.models": [], + "spacy.ml.models.entity_linker": [], + "spacy.ml.models.multi_task": [], + "spacy.ml.models.parser": [], + "spacy.ml.models.span_finder": [], + "spacy.ml.models.spancat": [], + "spacy.ml.models.tagger": [], + "spacy.ml.models.textcat": [], + "spacy.ml.models.tok2vec": [], + "spacy.ml.staticvectors": [], + "spacy.ml.tb_framework": [], + "spacy.pipe_analysis": [], + "spacy.pipeline": [], + 
"spacy.pipeline._edit_tree_internals": [], + "spacy.pipeline._edit_tree_internals.schemas": [], + "spacy.pipeline._parser_internals": [], + "spacy.pipeline.attributeruler": [], + "spacy.pipeline.edit_tree_lemmatizer": [], + "spacy.pipeline.entity_linker": [], + "spacy.pipeline.entityruler": [], + "spacy.pipeline.functions": [], + "spacy.pipeline.legacy": [], + "spacy.pipeline.legacy.entity_linker": [], + "spacy.pipeline.lemmatizer": [], + "spacy.pipeline.span_finder": [], + "spacy.pipeline.span_ruler": [], + "spacy.pipeline.spancat": [], + "spacy.pipeline.textcat": [], + "spacy.pipeline.textcat_multilabel": [], + "spacy.pipeline.tok2vec": [], + "spacy.schemas": [], + "spacy.scorer": [], + "spacy.tests": [], + "spacy.tests.conftest": [], + "spacy.tests.doc": [], + "spacy.tests.doc.test_add_entities": [], + "spacy.tests.doc.test_array": [], + "spacy.tests.doc.test_creation": [], + "spacy.tests.doc.test_doc_api": [], + "spacy.tests.doc.test_graph": [], + "spacy.tests.doc.test_json_doc_conversion": [], + "spacy.tests.doc.test_morphanalysis": [], + "spacy.tests.doc.test_pickle_doc": [], + "spacy.tests.doc.test_retokenize_merge": [], + "spacy.tests.doc.test_retokenize_split": [], + "spacy.tests.doc.test_span": [], + "spacy.tests.doc.test_span_group": [], + "spacy.tests.doc.test_token_api": [], + "spacy.tests.doc.test_underscore": [], + "spacy.tests.enable_gpu": [], + "spacy.tests.lang": [], + "spacy.tests.lang.af": [], + "spacy.tests.lang.af.test_text": [], + "spacy.tests.lang.af.test_tokenizer": [], + "spacy.tests.lang.am": [], + "spacy.tests.lang.am.test_exception": [], + "spacy.tests.lang.am.test_text": [], + "spacy.tests.lang.ar": [], + "spacy.tests.lang.ar.test_exceptions": [], + "spacy.tests.lang.ar.test_text": [], + "spacy.tests.lang.bn": [], + "spacy.tests.lang.bn.test_tokenizer": [], + "spacy.tests.lang.ca": [], + "spacy.tests.lang.ca.test_exception": [], + "spacy.tests.lang.ca.test_prefix_suffix_infix": [], + "spacy.tests.lang.ca.test_text": [], + "spacy.tests.lang.cs": [], + "spacy.tests.lang.cs.test_text": [], + "spacy.tests.lang.da": [], + "spacy.tests.lang.da.test_exceptions": [], + "spacy.tests.lang.da.test_noun_chunks": [], + "spacy.tests.lang.da.test_prefix_suffix_infix": [], + "spacy.tests.lang.da.test_text": [], + "spacy.tests.lang.de": [], + "spacy.tests.lang.de.test_exceptions": [], + "spacy.tests.lang.de.test_noun_chunks": [], + "spacy.tests.lang.de.test_parser": [], + "spacy.tests.lang.de.test_prefix_suffix_infix": [], + "spacy.tests.lang.de.test_text": [], + "spacy.tests.lang.dsb": [], + "spacy.tests.lang.dsb.test_text": [], + "spacy.tests.lang.dsb.test_tokenizer": [], + "spacy.tests.lang.el": [], + "spacy.tests.lang.el.test_exception": [], + "spacy.tests.lang.el.test_noun_chunks": [], + "spacy.tests.lang.el.test_text": [], + "spacy.tests.lang.en": [], + "spacy.tests.lang.en.test_customized_tokenizer": [], + "spacy.tests.lang.en.test_exceptions": [], + "spacy.tests.lang.en.test_indices": [], + "spacy.tests.lang.en.test_noun_chunks": [], + "spacy.tests.lang.en.test_parser": [], + "spacy.tests.lang.en.test_prefix_suffix_infix": [], + "spacy.tests.lang.en.test_punct": [], + "spacy.tests.lang.en.test_sbd": [], + "spacy.tests.lang.en.test_text": [], + "spacy.tests.lang.en.test_tokenizer": [], + "spacy.tests.lang.es": [], + "spacy.tests.lang.es.test_exception": [], + "spacy.tests.lang.es.test_noun_chunks": [], + "spacy.tests.lang.es.test_text": [], + "spacy.tests.lang.et": [], + "spacy.tests.lang.et.test_text": [], + "spacy.tests.lang.et.test_tokenizer": [], + 
"spacy.tests.lang.eu": [], + "spacy.tests.lang.eu.test_text": [], + "spacy.tests.lang.fa": [], + "spacy.tests.lang.fa.test_noun_chunks": [], + "spacy.tests.lang.fi": [], + "spacy.tests.lang.fi.test_noun_chunks": [], + "spacy.tests.lang.fi.test_text": [], + "spacy.tests.lang.fi.test_tokenizer": [], + "spacy.tests.lang.fo": [], + "spacy.tests.lang.fo.test_tokenizer": [], + "spacy.tests.lang.fr": [], + "spacy.tests.lang.fr.test_exceptions": [], + "spacy.tests.lang.fr.test_noun_chunks": [], + "spacy.tests.lang.fr.test_prefix_suffix_infix": [], + "spacy.tests.lang.fr.test_text": [], + "spacy.tests.lang.ga": [], + "spacy.tests.lang.ga.test_tokenizer": [], + "spacy.tests.lang.grc": [], + "spacy.tests.lang.grc.test_text": [], + "spacy.tests.lang.grc.test_tokenizer": [], + "spacy.tests.lang.gu": [], + "spacy.tests.lang.gu.test_text": [], + "spacy.tests.lang.he": [], + "spacy.tests.lang.he.test_tokenizer": [], + "spacy.tests.lang.hi": [], + "spacy.tests.lang.hi.test_lex_attrs": [], + "spacy.tests.lang.hi.test_text": [], + "spacy.tests.lang.hr": [], + "spacy.tests.lang.hr.test_text": [], + "spacy.tests.lang.hr.test_tokenizer": [], + "spacy.tests.lang.hsb": [], + "spacy.tests.lang.hsb.test_text": [], + "spacy.tests.lang.hsb.test_tokenizer": [], + "spacy.tests.lang.hu": [], + "spacy.tests.lang.hu.test_tokenizer": [], + "spacy.tests.lang.hy": [], + "spacy.tests.lang.hy.test_text": [], + "spacy.tests.lang.hy.test_tokenizer": [], + "spacy.tests.lang.id": [], + "spacy.tests.lang.id.test_noun_chunks": [], + "spacy.tests.lang.id.test_prefix_suffix_infix": [], + "spacy.tests.lang.id.test_text": [], + "spacy.tests.lang.is": [], + "spacy.tests.lang.is.test_text": [], + "spacy.tests.lang.is.test_tokenizer": [], + "spacy.tests.lang.it": [], + "spacy.tests.lang.it.test_noun_chunks": [], + "spacy.tests.lang.it.test_prefix_suffix_infix": [], + "spacy.tests.lang.it.test_stopwords": [], + "spacy.tests.lang.it.test_text": [], + "spacy.tests.lang.ja": [], + "spacy.tests.lang.ja.test_lemmatization": [], + "spacy.tests.lang.ja.test_morphologizer_factory": [], + "spacy.tests.lang.ja.test_serialize": [], + "spacy.tests.lang.ja.test_tokenizer": [], + "spacy.tests.lang.ko": [], + "spacy.tests.lang.ko.test_lemmatization": [], + "spacy.tests.lang.ko.test_serialize": [], + "spacy.tests.lang.ko.test_tokenizer": [], + "spacy.tests.lang.ky": [], + "spacy.tests.lang.ky.test_tokenizer": [], + "spacy.tests.lang.la": [], + "spacy.tests.lang.la.test_exception": [], + "spacy.tests.lang.la.test_noun_chunks": [], + "spacy.tests.lang.la.test_text": [], + "spacy.tests.lang.lb": [], + "spacy.tests.lang.lb.test_exceptions": [], + "spacy.tests.lang.lb.test_prefix_suffix_infix": [], + "spacy.tests.lang.lb.test_text": [], + "spacy.tests.lang.lg": [], + "spacy.tests.lang.lg.test_tokenizer": [], + "spacy.tests.lang.lt": [], + "spacy.tests.lang.lt.test_text": [], + "spacy.tests.lang.lv": [], + "spacy.tests.lang.lv.test_text": [], + "spacy.tests.lang.lv.test_tokenizer": [], + "spacy.tests.lang.mk": [], + "spacy.tests.lang.mk.test_text": [], + "spacy.tests.lang.ml": [], + "spacy.tests.lang.ml.test_text": [], + "spacy.tests.lang.ms": [], + "spacy.tests.lang.ms.test_noun_chunks": [], + "spacy.tests.lang.ms.test_prefix_suffix_infix": [], + "spacy.tests.lang.ms.test_text": [], + "spacy.tests.lang.nb": [], + "spacy.tests.lang.nb.test_noun_chunks": [], + "spacy.tests.lang.nb.test_tokenizer": [], + "spacy.tests.lang.ne": [], + "spacy.tests.lang.ne.test_text": [], + "spacy.tests.lang.nl": [], + "spacy.tests.lang.nl.test_noun_chunks": [], + 
"spacy.tests.lang.nl.test_text": [], + "spacy.tests.lang.nn": [], + "spacy.tests.lang.nn.test_tokenizer": [], + "spacy.tests.lang.pl": [], + "spacy.tests.lang.pl.test_text": [], + "spacy.tests.lang.pl.test_tokenizer": [], + "spacy.tests.lang.pt": [], + "spacy.tests.lang.pt.test_noun_chunks": [], + "spacy.tests.lang.pt.test_text": [], + "spacy.tests.lang.ro": [], + "spacy.tests.lang.ro.test_tokenizer": [], + "spacy.tests.lang.ru": [], + "spacy.tests.lang.ru.test_exceptions": [], + "spacy.tests.lang.ru.test_lemmatizer": [], + "spacy.tests.lang.ru.test_text": [], + "spacy.tests.lang.ru.test_tokenizer": [], + "spacy.tests.lang.sa": [], + "spacy.tests.lang.sa.test_text": [], + "spacy.tests.lang.sk": [], + "spacy.tests.lang.sk.test_text": [], + "spacy.tests.lang.sk.test_tokenizer": [], + "spacy.tests.lang.sl": [], + "spacy.tests.lang.sl.test_text": [], + "spacy.tests.lang.sl.test_tokenizer": [], + "spacy.tests.lang.sq": [], + "spacy.tests.lang.sq.test_text": [], + "spacy.tests.lang.sq.test_tokenizer": [], + "spacy.tests.lang.sr": [], + "spacy.tests.lang.sr.test_exceptions": [], + "spacy.tests.lang.sr.test_tokenizer": [], + "spacy.tests.lang.sv": [], + "spacy.tests.lang.sv.test_exceptions": [], + "spacy.tests.lang.sv.test_lex_attrs": [], + "spacy.tests.lang.sv.test_noun_chunks": [], + "spacy.tests.lang.sv.test_prefix_suffix_infix": [], + "spacy.tests.lang.sv.test_text": [], + "spacy.tests.lang.sv.test_tokenizer": [], + "spacy.tests.lang.ta": [], + "spacy.tests.lang.ta.test_text": [], + "spacy.tests.lang.ta.test_tokenizer": [], + "spacy.tests.lang.test_attrs": [], + "spacy.tests.lang.test_initialize": [], + "spacy.tests.lang.test_lemmatizers": [], + "spacy.tests.lang.th": [], + "spacy.tests.lang.th.test_serialize": [], + "spacy.tests.lang.th.test_tokenizer": [], + "spacy.tests.lang.ti": [], + "spacy.tests.lang.ti.test_exception": [], + "spacy.tests.lang.ti.test_text": [], + "spacy.tests.lang.tl": [], + "spacy.tests.lang.tl.test_indices": [], + "spacy.tests.lang.tl.test_punct": [], + "spacy.tests.lang.tl.test_text": [], + "spacy.tests.lang.tr": [], + "spacy.tests.lang.tr.test_noun_chunks": [], + "spacy.tests.lang.tr.test_parser": [], + "spacy.tests.lang.tr.test_text": [], + "spacy.tests.lang.tr.test_tokenizer": [], + "spacy.tests.lang.tt": [], + "spacy.tests.lang.tt.test_tokenizer": [], + "spacy.tests.lang.uk": [], + "spacy.tests.lang.uk.test_lemmatizer": [], + "spacy.tests.lang.uk.test_tokenizer": [], + "spacy.tests.lang.uk.test_tokenizer_exc": [], + "spacy.tests.lang.ur": [], + "spacy.tests.lang.ur.test_prefix_suffix_infix": [], + "spacy.tests.lang.ur.test_text": [], + "spacy.tests.lang.vi": [], + "spacy.tests.lang.vi.test_serialize": [], + "spacy.tests.lang.vi.test_tokenizer": [], + "spacy.tests.lang.xx": [], + "spacy.tests.lang.xx.test_text": [], + "spacy.tests.lang.xx.test_tokenizer": [], + "spacy.tests.lang.yo": [], + "spacy.tests.lang.yo.test_text": [], + "spacy.tests.lang.zh": [], + "spacy.tests.lang.zh.test_serialize": [], + "spacy.tests.lang.zh.test_text": [], + "spacy.tests.lang.zh.test_tokenizer": [], + "spacy.tests.matcher": [], + "spacy.tests.matcher.test_dependency_matcher": [], + "spacy.tests.matcher.test_levenshtein": [], + "spacy.tests.matcher.test_matcher_api": [], + "spacy.tests.matcher.test_matcher_logic": [], + "spacy.tests.matcher.test_pattern_validation": [], + "spacy.tests.matcher.test_phrase_matcher": [], + "spacy.tests.morphology": [], + "spacy.tests.morphology.test_morph_converters": [], + "spacy.tests.morphology.test_morph_features": [], + 
"spacy.tests.morphology.test_morph_pickle": [], + "spacy.tests.package": [], + "spacy.tests.package.test_requirements": [], + "spacy.tests.parser": [], + "spacy.tests.parser.test_add_label": [], + "spacy.tests.parser.test_arc_eager_oracle": [], + "spacy.tests.parser.test_ner": [], + "spacy.tests.parser.test_neural_parser": [], + "spacy.tests.parser.test_nn_beam": [], + "spacy.tests.parser.test_nonproj": [], + "spacy.tests.parser.test_parse": [], + "spacy.tests.parser.test_parse_navigate": [], + "spacy.tests.parser.test_preset_sbd": [], + "spacy.tests.parser.test_space_attachment": [], + "spacy.tests.parser.test_state": [], + "spacy.tests.pipeline": [], + "spacy.tests.pipeline.test_analysis": [], + "spacy.tests.pipeline.test_annotates_on_update": [], + "spacy.tests.pipeline.test_attributeruler": [], + "spacy.tests.pipeline.test_edit_tree_lemmatizer": [], + "spacy.tests.pipeline.test_entity_linker": [], + "spacy.tests.pipeline.test_entity_ruler": [], + "spacy.tests.pipeline.test_functions": [], + "spacy.tests.pipeline.test_initialize": [], + "spacy.tests.pipeline.test_lemmatizer": [], + "spacy.tests.pipeline.test_models": [], + "spacy.tests.pipeline.test_morphologizer": [], + "spacy.tests.pipeline.test_pipe_factories": [], + "spacy.tests.pipeline.test_pipe_methods": [], + "spacy.tests.pipeline.test_sentencizer": [], + "spacy.tests.pipeline.test_senter": [], + "spacy.tests.pipeline.test_span_finder": [], + "spacy.tests.pipeline.test_span_ruler": [], + "spacy.tests.pipeline.test_spancat": [], + "spacy.tests.pipeline.test_tagger": [], + "spacy.tests.pipeline.test_textcat": [], + "spacy.tests.pipeline.test_tok2vec": [], + "spacy.tests.serialize": [], + "spacy.tests.serialize.test_resource_warning": [], + "spacy.tests.serialize.test_serialize_config": [], + "spacy.tests.serialize.test_serialize_doc": [], + "spacy.tests.serialize.test_serialize_docbin": [], + "spacy.tests.serialize.test_serialize_extension_attrs": [], + "spacy.tests.serialize.test_serialize_kb": [], + "spacy.tests.serialize.test_serialize_language": [], + "spacy.tests.serialize.test_serialize_pipeline": [], + "spacy.tests.serialize.test_serialize_span_groups": [], + "spacy.tests.serialize.test_serialize_tokenizer": [], + "spacy.tests.serialize.test_serialize_vocab_strings": [], + "spacy.tests.test_architectures": [], + "spacy.tests.test_cli": [], + "spacy.tests.test_cli_app": [], + "spacy.tests.test_displacy": [], + "spacy.tests.test_errors": [], + "spacy.tests.test_language": [], + "spacy.tests.test_misc": [], + "spacy.tests.test_models": [], + "spacy.tests.test_pickles": [], + "spacy.tests.test_scorer": [], + "spacy.tests.test_ty": [], + "spacy.tests.tok2vec": [], + "spacy.tests.tokenizer": [], + "spacy.tests.tokenizer.test_exceptions": [], + "spacy.tests.tokenizer.test_explain": [], + "spacy.tests.tokenizer.test_naughty_strings": [], + "spacy.tests.tokenizer.test_tokenizer": [], + "spacy.tests.tokenizer.test_urls": [], + "spacy.tests.tokenizer.test_whitespace": [], + "spacy.tests.training": [], + "spacy.tests.training.test_augmenters": [], + "spacy.tests.training.test_corpus": [], + "spacy.tests.training.test_logger": [], + "spacy.tests.training.test_new_example": [], + "spacy.tests.training.test_pretraining": [], + "spacy.tests.training.test_readers": [], + "spacy.tests.training.test_rehearse": [], + "spacy.tests.training.test_training": [], + "spacy.tests.util": [], + "spacy.tests.vocab_vectors": [], + "spacy.tests.vocab_vectors.test_lexeme": [], + "spacy.tests.vocab_vectors.test_lookups": [], + 
"spacy.tests.vocab_vectors.test_similarity": [], + "spacy.tests.vocab_vectors.test_stringstore": [], + "spacy.tests.vocab_vectors.test_vectors": [], + "spacy.tests.vocab_vectors.test_vocab_api": [], + "spacy.tokens": [], + "spacy.tokens._dict_proxies": [], + "spacy.tokens._serialize": [], + "spacy.tokens.underscore": [], + "spacy.training": [], + "spacy.training.alignment": [], + "spacy.training.augment": [], + "spacy.training.batchers": [], + "spacy.training.callbacks": [], + "spacy.training.converters": [], + "spacy.training.converters.conll_ner_to_docs": [], + "spacy.training.converters.conllu_to_docs": [], + "spacy.training.converters.iob_to_docs": [], + "spacy.training.converters.json_to_docs": [], + "spacy.training.corpus": [], + "spacy.training.initialize": [], + "spacy.training.iob_utils": [], + "spacy.training.loggers": [], + "spacy.training.loop": [], + "spacy.training.pretrain": [], + "spacy.ty": [], + "spacy.util": [] + }, + "spacy-legacy": { + "spacy_legacy": [], + "spacy_legacy.architectures": [], + "spacy_legacy.architectures.entity_linker": [], + "spacy_legacy.architectures.parser": [], + "spacy_legacy.architectures.tagger": [], + "spacy_legacy.architectures.textcat": [], + "spacy_legacy.architectures.tok2vec": [], + "spacy_legacy.components": [], + "spacy_legacy.components.entity_linker": [], + "spacy_legacy.layers": [], + "spacy_legacy.layers.staticvectors_v1": [], + "spacy_legacy.loggers": [], + "spacy_legacy.scorers": [], + "spacy_legacy.tests": [], + "spacy_legacy.tests.parser": [], + "spacy_legacy.tests.parser.test_parser": [], + "spacy_legacy.tests.pipeline": [], + "spacy_legacy.tests.pipeline.test_tagger": [], + "spacy_legacy.tests.pipeline.test_textcat": [], + "spacy_legacy.tests.pipeline.test_tok2vec": [], + "spacy_legacy.tests.test_layer": [], + "spacy_legacy.tests.test_legacy": [], + "spacy_legacy.tests.test_logger": [], + "spacy_legacy.tests.test_scorers": [] + }, + "spacy-loggers": { + "spacy_loggers": [], + "spacy_loggers.chain": [], + "spacy_loggers.clearml": [], + "spacy_loggers.cupy": [], + "spacy_loggers.lookup": [], + "spacy_loggers.mlflow": [], + "spacy_loggers.pytorch": [], + "spacy_loggers.tests": [], + "spacy_loggers.tests.test_chain": [], + "spacy_loggers.tests.test_cupy": [], + "spacy_loggers.tests.test_lookup": [], + "spacy_loggers.tests.test_registry": [], + "spacy_loggers.tests.util": [], + "spacy_loggers.util": [], + "spacy_loggers.wandb": [] + }, "spark-nlp": { "com": [], "com.johnsnowlabs": [], From b1d03c94b2b8074fe5d0c30f33702d4df27ac270 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:03:06 +0200 Subject: [PATCH 04/22] whitelist nltk (#1989) ## Changes whitelist nltk ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 358 ++++++++++++++++++ 1 file changed, 358 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 60f236e455..2eed0df1b5 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -5020,6 +5020,364 @@ "networkx": { "networkx": [] }, + "nltk": { + "nltk": [], + "nltk.app": [], + "nltk.app.chartparser_app": [], + "nltk.app.chunkparser_app": [], + "nltk.app.collocations_app": [], + "nltk.app.concordance_app": [], + "nltk.app.nemo_app": [], + "nltk.app.rdparser_app": [], + "nltk.app.srparser_app": [], + "nltk.app.wordfreq_app": [], + "nltk.app.wordnet_app": [], + 
"nltk.book": [], + "nltk.ccg": [], + "nltk.ccg.api": [], + "nltk.ccg.chart": [], + "nltk.ccg.combinator": [], + "nltk.ccg.lexicon": [], + "nltk.ccg.logic": [], + "nltk.chat": [], + "nltk.chat.eliza": [], + "nltk.chat.iesha": [], + "nltk.chat.rude": [], + "nltk.chat.suntsu": [], + "nltk.chat.util": [], + "nltk.chat.zen": [], + "nltk.chunk": [], + "nltk.chunk.api": [], + "nltk.chunk.named_entity": [], + "nltk.chunk.regexp": [], + "nltk.chunk.util": [], + "nltk.classify": [], + "nltk.classify.api": [], + "nltk.classify.decisiontree": [], + "nltk.classify.maxent": [], + "nltk.classify.megam": [], + "nltk.classify.naivebayes": [], + "nltk.classify.positivenaivebayes": [], + "nltk.classify.rte_classify": [], + "nltk.classify.scikitlearn": [], + "nltk.classify.senna": [], + "nltk.classify.svm": [], + "nltk.classify.tadm": [], + "nltk.classify.textcat": [], + "nltk.classify.util": [], + "nltk.classify.weka": [], + "nltk.cli": [], + "nltk.cluster": [], + "nltk.cluster.api": [], + "nltk.cluster.em": [], + "nltk.cluster.gaac": [], + "nltk.cluster.kmeans": [], + "nltk.cluster.util": [], + "nltk.collections": [], + "nltk.collocations": [], + "nltk.compat": [], + "nltk.corpus": [], + "nltk.corpus.europarl_raw": [], + "nltk.corpus.reader": [], + "nltk.corpus.reader.aligned": [], + "nltk.corpus.reader.api": [], + "nltk.corpus.reader.bcp47": [], + "nltk.corpus.reader.bnc": [], + "nltk.corpus.reader.bracket_parse": [], + "nltk.corpus.reader.categorized_sents": [], + "nltk.corpus.reader.chasen": [], + "nltk.corpus.reader.childes": [], + "nltk.corpus.reader.chunked": [], + "nltk.corpus.reader.cmudict": [], + "nltk.corpus.reader.comparative_sents": [], + "nltk.corpus.reader.conll": [], + "nltk.corpus.reader.crubadan": [], + "nltk.corpus.reader.dependency": [], + "nltk.corpus.reader.framenet": [], + "nltk.corpus.reader.ieer": [], + "nltk.corpus.reader.indian": [], + "nltk.corpus.reader.ipipan": [], + "nltk.corpus.reader.knbc": [], + "nltk.corpus.reader.lin": [], + "nltk.corpus.reader.markdown": [], + "nltk.corpus.reader.mte": [], + "nltk.corpus.reader.nkjp": [], + "nltk.corpus.reader.nombank": [], + "nltk.corpus.reader.nps_chat": [], + "nltk.corpus.reader.opinion_lexicon": [], + "nltk.corpus.reader.panlex_lite": [], + "nltk.corpus.reader.panlex_swadesh": [], + "nltk.corpus.reader.pl196x": [], + "nltk.corpus.reader.plaintext": [], + "nltk.corpus.reader.ppattach": [], + "nltk.corpus.reader.propbank": [], + "nltk.corpus.reader.pros_cons": [], + "nltk.corpus.reader.reviews": [], + "nltk.corpus.reader.rte": [], + "nltk.corpus.reader.semcor": [], + "nltk.corpus.reader.senseval": [], + "nltk.corpus.reader.sentiwordnet": [], + "nltk.corpus.reader.sinica_treebank": [], + "nltk.corpus.reader.string_category": [], + "nltk.corpus.reader.switchboard": [], + "nltk.corpus.reader.tagged": [], + "nltk.corpus.reader.timit": [], + "nltk.corpus.reader.toolbox": [], + "nltk.corpus.reader.twitter": [], + "nltk.corpus.reader.udhr": [], + "nltk.corpus.reader.util": [], + "nltk.corpus.reader.verbnet": [], + "nltk.corpus.reader.wordlist": [], + "nltk.corpus.reader.wordnet": [], + "nltk.corpus.reader.xmldocs": [], + "nltk.corpus.reader.ycoe": [], + "nltk.corpus.util": [], + "nltk.data": [], + "nltk.decorators": [], + "nltk.downloader": [], + "nltk.draw": [], + "nltk.draw.cfg": [], + "nltk.draw.dispersion": [], + "nltk.draw.table": [], + "nltk.draw.tree": [], + "nltk.draw.util": [], + "nltk.featstruct": [], + "nltk.grammar": [], + "nltk.help": [], + "nltk.inference": [], + "nltk.inference.api": [], + "nltk.inference.discourse": [], + 
"nltk.inference.mace": [], + "nltk.inference.nonmonotonic": [], + "nltk.inference.prover9": [], + "nltk.inference.resolution": [], + "nltk.inference.tableau": [], + "nltk.internals": [], + "nltk.jsontags": [], + "nltk.langnames": [], + "nltk.lazyimport": [], + "nltk.lm": [], + "nltk.lm.api": [], + "nltk.lm.counter": [], + "nltk.lm.models": [], + "nltk.lm.preprocessing": [], + "nltk.lm.smoothing": [], + "nltk.lm.util": [], + "nltk.lm.vocabulary": [], + "nltk.metrics": [], + "nltk.metrics.agreement": [], + "nltk.metrics.aline": [], + "nltk.metrics.association": [], + "nltk.metrics.confusionmatrix": [], + "nltk.metrics.distance": [], + "nltk.metrics.paice": [], + "nltk.metrics.scores": [], + "nltk.metrics.segmentation": [], + "nltk.metrics.spearman": [], + "nltk.misc": [], + "nltk.misc.babelfish": [], + "nltk.misc.chomsky": [], + "nltk.misc.minimalset": [], + "nltk.misc.sort": [], + "nltk.misc.wordfinder": [], + "nltk.parse": [], + "nltk.parse.api": [], + "nltk.parse.bllip": [], + "nltk.parse.chart": [], + "nltk.parse.corenlp": [], + "nltk.parse.dependencygraph": [], + "nltk.parse.earleychart": [], + "nltk.parse.evaluate": [], + "nltk.parse.featurechart": [], + "nltk.parse.generate": [], + "nltk.parse.malt": [], + "nltk.parse.nonprojectivedependencyparser": [], + "nltk.parse.pchart": [], + "nltk.parse.projectivedependencyparser": [], + "nltk.parse.recursivedescent": [], + "nltk.parse.shiftreduce": [], + "nltk.parse.stanford": [], + "nltk.parse.transitionparser": [], + "nltk.parse.util": [], + "nltk.parse.viterbi": [], + "nltk.probability": [], + "nltk.sem": [], + "nltk.sem.boxer": [], + "nltk.sem.chat80": [], + "nltk.sem.cooper_storage": [], + "nltk.sem.drt": [], + "nltk.sem.drt_glue_demo": [], + "nltk.sem.evaluate": [], + "nltk.sem.glue": [], + "nltk.sem.hole": [], + "nltk.sem.lfg": [], + "nltk.sem.linearlogic": [], + "nltk.sem.logic": [], + "nltk.sem.relextract": [], + "nltk.sem.skolemize": [], + "nltk.sem.util": [], + "nltk.sentiment": [], + "nltk.sentiment.sentiment_analyzer": [], + "nltk.sentiment.util": [], + "nltk.sentiment.vader": [], + "nltk.stem": [], + "nltk.stem.api": [], + "nltk.stem.arlstem": [], + "nltk.stem.arlstem2": [], + "nltk.stem.cistem": [], + "nltk.stem.isri": [], + "nltk.stem.lancaster": [], + "nltk.stem.porter": [], + "nltk.stem.regexp": [], + "nltk.stem.rslp": [], + "nltk.stem.snowball": [], + "nltk.stem.util": [], + "nltk.stem.wordnet": [], + "nltk.tag": [], + "nltk.tag.api": [], + "nltk.tag.brill": [], + "nltk.tag.brill_trainer": [], + "nltk.tag.crf": [], + "nltk.tag.hmm": [], + "nltk.tag.hunpos": [], + "nltk.tag.mapping": [], + "nltk.tag.perceptron": [], + "nltk.tag.senna": [], + "nltk.tag.sequential": [], + "nltk.tag.stanford": [], + "nltk.tag.tnt": [], + "nltk.tag.util": [], + "nltk.tbl": [], + "nltk.tbl.api": [], + "nltk.tbl.demo": [], + "nltk.tbl.erroranalysis": [], + "nltk.tbl.feature": [], + "nltk.tbl.rule": [], + "nltk.tbl.template": [], + "nltk.test": [], + "nltk.test.all": [], + "nltk.test.childes_fixt": [], + "nltk.test.classify_fixt": [], + "nltk.test.conftest": [], + "nltk.test.gensim_fixt": [], + "nltk.test.gluesemantics_malt_fixt": [], + "nltk.test.portuguese_en_fixt": [], + "nltk.test.probability_fixt": [], + "nltk.test.setup_fixt": [], + "nltk.test.unit": [], + "nltk.test.unit.lm": [], + "nltk.test.unit.lm.test_counter": [], + "nltk.test.unit.lm.test_models": [], + "nltk.test.unit.lm.test_preprocessing": [], + "nltk.test.unit.lm.test_vocabulary": [], + "nltk.test.unit.test_aline": [], + "nltk.test.unit.test_bllip": [], + 
"nltk.test.unit.test_brill": [], + "nltk.test.unit.test_cfd_mutation": [], + "nltk.test.unit.test_cfg2chomsky": [], + "nltk.test.unit.test_chunk": [], + "nltk.test.unit.test_classify": [], + "nltk.test.unit.test_collocations": [], + "nltk.test.unit.test_concordance": [], + "nltk.test.unit.test_corenlp": [], + "nltk.test.unit.test_corpora": [], + "nltk.test.unit.test_corpus_views": [], + "nltk.test.unit.test_data": [], + "nltk.test.unit.test_disagreement": [], + "nltk.test.unit.test_distance": [], + "nltk.test.unit.test_downloader": [], + "nltk.test.unit.test_freqdist": [], + "nltk.test.unit.test_hmm": [], + "nltk.test.unit.test_json2csv_corpus": [], + "nltk.test.unit.test_json_serialization": [], + "nltk.test.unit.test_metrics": [], + "nltk.test.unit.test_naivebayes": [], + "nltk.test.unit.test_nombank": [], + "nltk.test.unit.test_pl196x": [], + "nltk.test.unit.test_pos_tag": [], + "nltk.test.unit.test_ribes": [], + "nltk.test.unit.test_rte_classify": [], + "nltk.test.unit.test_seekable_unicode_stream_reader": [], + "nltk.test.unit.test_senna": [], + "nltk.test.unit.test_stem": [], + "nltk.test.unit.test_tag": [], + "nltk.test.unit.test_tgrep": [], + "nltk.test.unit.test_tokenize": [], + "nltk.test.unit.test_twitter_auth": [], + "nltk.test.unit.test_util": [], + "nltk.test.unit.test_wordnet": [], + "nltk.test.unit.translate": [], + "nltk.test.unit.translate.test_bleu": [], + "nltk.test.unit.translate.test_gdfa": [], + "nltk.test.unit.translate.test_ibm1": [], + "nltk.test.unit.translate.test_ibm2": [], + "nltk.test.unit.translate.test_ibm3": [], + "nltk.test.unit.translate.test_ibm4": [], + "nltk.test.unit.translate.test_ibm5": [], + "nltk.test.unit.translate.test_ibm_model": [], + "nltk.test.unit.translate.test_meteor": [], + "nltk.test.unit.translate.test_nist": [], + "nltk.test.unit.translate.test_stack_decoder": [], + "nltk.text": [], + "nltk.tgrep": [], + "nltk.tokenize": [], + "nltk.tokenize.api": [], + "nltk.tokenize.casual": [], + "nltk.tokenize.destructive": [], + "nltk.tokenize.legality_principle": [], + "nltk.tokenize.mwe": [], + "nltk.tokenize.nist": [], + "nltk.tokenize.punkt": [], + "nltk.tokenize.regexp": [], + "nltk.tokenize.repp": [], + "nltk.tokenize.sexpr": [], + "nltk.tokenize.simple": [], + "nltk.tokenize.sonority_sequencing": [], + "nltk.tokenize.stanford": [], + "nltk.tokenize.stanford_segmenter": [], + "nltk.tokenize.texttiling": [], + "nltk.tokenize.toktok": [], + "nltk.tokenize.treebank": [], + "nltk.tokenize.util": [], + "nltk.toolbox": [], + "nltk.translate": [], + "nltk.translate.api": [], + "nltk.translate.bleu_score": [], + "nltk.translate.chrf_score": [], + "nltk.translate.gale_church": [], + "nltk.translate.gdfa": [], + "nltk.translate.gleu_score": [], + "nltk.translate.ibm1": [], + "nltk.translate.ibm2": [], + "nltk.translate.ibm3": [], + "nltk.translate.ibm4": [], + "nltk.translate.ibm5": [], + "nltk.translate.ibm_model": [], + "nltk.translate.meteor_score": [], + "nltk.translate.metrics": [], + "nltk.translate.nist_score": [], + "nltk.translate.phrase_based": [], + "nltk.translate.ribes_score": [], + "nltk.translate.stack_decoder": [], + "nltk.tree": [], + "nltk.tree.immutable": [], + "nltk.tree.parented": [], + "nltk.tree.parsing": [], + "nltk.tree.prettyprinter": [], + "nltk.tree.probabilistic": [], + "nltk.tree.transforms": [], + "nltk.tree.tree": [], + "nltk.treeprettyprinter": [], + "nltk.treetransforms": [], + "nltk.twitter": [], + "nltk.twitter.api": [], + "nltk.twitter.common": [], + "nltk.twitter.twitter_demo": [], + 
"nltk.twitter.twitterclient": [], + "nltk.twitter.util": [], + "nltk.util": [], + "nltk.wsd": [] + }, "numba": { "numba": [], "numba._version": [], From 235ee59b3ca426070aa7d75584302f4342a743ae Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:03:28 +0200 Subject: [PATCH 05/22] whitelist datasetsforecast (#1995) ## Changes whitelist datasetsforecast ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 2eed0df1b5..f166aea33a 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1394,6 +1394,23 @@ "databricks-sdk": { "databricks.sdk": [] }, + "datasetsforecast": { + "action_files": [], + "action_files.test_dask": [], + "action_files.test_ray": [], + "action_files.utils": [], + "datasetsforecast": [], + "datasetsforecast._modidx": [], + "datasetsforecast._nbdev": [], + "datasetsforecast.evaluation": [], + "datasetsforecast.hierarchical": [], + "datasetsforecast.long_horizon": [], + "datasetsforecast.losses": [], + "datasetsforecast.m3": [], + "datasetsforecast.m4": [], + "datasetsforecast.m5": [], + "datasetsforecast.utils": [] + }, "dbldatagen": { "dbldatagen": [], "dbldatagen._version": [], From 772dce606d9041d6f6207e1a9c0c27b67c4c31bf Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:03:47 +0200 Subject: [PATCH 06/22] whitelist petastorm (#1996) ## Changes whitelist petastorm ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index f166aea33a..37a055d070 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -6496,6 +6496,183 @@ "peft.utils.peft_types": [], "peft.utils.save_and_load": [] }, + "petastorm": { + "petastorm": [], + "petastorm.arrow_reader_worker": [], + "petastorm.benchmark": [], + "petastorm.benchmark.cli": [], + "petastorm.benchmark.dummy_reader": [], + "petastorm.benchmark.throughput": [], + "petastorm.cache": [], + "petastorm.codecs": [], + "petastorm.errors": [], + "petastorm.etl": [], + "petastorm.etl.dataset_metadata": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.etl.legacy": [], + "petastorm.etl.metadata_util": [], + "petastorm.etl.petastorm_generate_metadata": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. 
Rewrite it using spark" + } + ], + "petastorm.etl.rowgroup_indexers": [], + "petastorm.etl.rowgroup_indexing": [], + "petastorm.fs_utils": [], + "petastorm.generator": [], + "petastorm.hdfs": [], + "petastorm.hdfs.namenode": [], + "petastorm.hdfs.tests": [], + "petastorm.hdfs.tests.test_hdfs_namenode": [], + "petastorm.local_disk_cache": [], + "petastorm.namedtuple_gt_255_fields": [], + "petastorm.ngram": [], + "petastorm.predicates": [], + "petastorm.py_dict_reader_worker": [], + "petastorm.pyarrow_helpers": [], + "petastorm.pyarrow_helpers.batching_table_queue": [], + "petastorm.pytorch": [], + "petastorm.reader": [], + "petastorm.reader_impl": [], + "petastorm.reader_impl.arrow_table_serializer": [], + "petastorm.reader_impl.pickle_serializer": [], + "petastorm.reader_impl.pytorch_shuffling_buffer": [], + "petastorm.reader_impl.shuffling_buffer": [], + "petastorm.selectors": [], + "petastorm.spark": [], + "petastorm.spark.spark_dataset_converter": [ + { + "code": "jvm-access-in-shared-clusters", + "message": "Cannot access Spark Driver JVM on UC Shared Clusters" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.spark_utils": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + } + ], + "petastorm.test_util": [], + "petastorm.test_util.reader_mock": [], + "petastorm.test_util.shuffling_analysis": [], + "petastorm.tests": [], + "petastorm.tests.bootstrap_test_schema_data": [], + "petastorm.tests.conftest": [], + "petastorm.tests.generate_dataset_for_legacy_tests": [], + "petastorm.tests.tempdir": [], + "petastorm.tests.test_arrow_table_serializer": [], + "petastorm.tests.test_benchmark": [], + "petastorm.tests.test_cache": [], + "petastorm.tests.test_codec_compressed_image": [], + "petastorm.tests.test_codec_ndarray": [], + "petastorm.tests.test_codec_scalar": [], + "petastorm.tests.test_common": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.tests.test_copy_dataset": [], + "petastorm.tests.test_dataset_metadata": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + } + ], + "petastorm.tests.test_decode_row": [], + "petastorm.tests.test_disk_cache": [], + "petastorm.tests.test_end_to_end": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.tests.test_end_to_end_predicates_impl": [], + "petastorm.tests.test_fs_utils": [], + "petastorm.tests.test_generate_metadata": [], + "petastorm.tests.test_metadata_read": [], + "petastorm.tests.test_ngram": [], + "petastorm.tests.test_ngram_end_to_end": [], + "petastorm.tests.test_parquet_reader": [], + "petastorm.tests.test_pickle_serializer": [], + "petastorm.tests.test_predicates": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. 
Rewrite it using spark" + }, + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + } + ], + "petastorm.tests.test_pytorch_dataloader": [], + "petastorm.tests.test_pytorch_utils": [], + "petastorm.tests.test_reader": [], + "petastorm.tests.test_reader_mock": [], + "petastorm.tests.test_reading_legacy_datasets": [], + "petastorm.tests.test_run_in_subprocess": [], + "petastorm.tests.test_shuffling_buffer": [], + "petastorm.tests.test_spark_dataset_converter": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.tests.test_spark_session_cli": [], + "petastorm.tests.test_spark_utils": [], + "petastorm.tests.test_tf_autograph": [], + "petastorm.tests.test_tf_dataset": [], + "petastorm.tests.test_tf_utils": [], + "petastorm.tests.test_transform": [], + "petastorm.tests.test_unischema": [], + "petastorm.tests.test_weighted_sampling_reader": [], + "petastorm.tf_utils": [], + "petastorm.tools": [], + "petastorm.tools.copy_dataset": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "petastorm.tools.spark_session_cli": [], + "petastorm.transform": [], + "petastorm.unischema": [], + "petastorm.utils": [], + "petastorm.weighted_sampling_reader": [], + "petastorm.workers_pool": [], + "petastorm.workers_pool.dummy_pool": [], + "petastorm.workers_pool.exec_in_new_process": [], + "petastorm.workers_pool.exec_in_new_process_entrypoint": [], + "petastorm.workers_pool.process_pool": [], + "petastorm.workers_pool.tests": [], + "petastorm.workers_pool.tests.stub_workers": [], + "petastorm.workers_pool.tests.test_ventilator": [], + "petastorm.workers_pool.tests.test_workers_pool": [], + "petastorm.workers_pool.thread_pool": [], + "petastorm.workers_pool.ventilator": [], + "petastorm.workers_pool.worker_base": [] + }, "pexpect": { "pexpect": [] }, From c9c16201df44bbc7e6a1ceeb75bda9f34cd94c46 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 4 Jul 2024 12:04:38 +0200 Subject: [PATCH 07/22] whitelist dbx (#2001) ## Changes whitelist dbx ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 37a055d070..096c72d6f9 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1460,6 +1460,129 @@ "dbldatagen.text_generators": [], "dbldatagen.utils": [] }, + "dbx": { + "dbx": [], + "dbx.api": [], + "dbx.api._module_loader": [], + "dbx.api.adjuster": [], + "dbx.api.adjuster.adjuster": [], + "dbx.api.adjuster.mixins": [], + "dbx.api.adjuster.mixins.base": [], + "dbx.api.adjuster.mixins.existing_cluster": [], + "dbx.api.adjuster.mixins.file_reference": [], + "dbx.api.adjuster.mixins.instance_pool": [], + "dbx.api.adjuster.mixins.instance_profile": [], + "dbx.api.adjuster.mixins.pipeline": [], + "dbx.api.adjuster.mixins.service_principal": [], + "dbx.api.adjuster.mixins.sql_properties": [], + "dbx.api.adjuster.policy": [], + "dbx.api.auth": [], + "dbx.api.build": [], + "dbx.api.client_provider": [], + "dbx.api.cluster": [], + "dbx.api.config_reader": [], + "dbx.api.configure": [], + 
"dbx.api.context": [], + "dbx.api.dependency": [], + "dbx.api.dependency.core_package": [], + "dbx.api.dependency.requirements": [], + "dbx.api.deployment": [], + "dbx.api.destroyer": [], + "dbx.api.execute": [], + "dbx.api.jinja": [], + "dbx.api.launch": [], + "dbx.api.launch.contexts": [], + "dbx.api.launch.functions": [], + "dbx.api.launch.pipeline_models": [], + "dbx.api.launch.processors": [], + "dbx.api.launch.runners": [], + "dbx.api.launch.runners.asset_based": [], + "dbx.api.launch.runners.base": [], + "dbx.api.launch.runners.pipeline": [], + "dbx.api.launch.runners.standard": [], + "dbx.api.launch.tracer": [], + "dbx.api.output_provider": [], + "dbx.api.services": [], + "dbx.api.services._base": [], + "dbx.api.services.jobs": [], + "dbx.api.services.permissions": [], + "dbx.api.services.pipelines": [], + "dbx.api.storage": [], + "dbx.api.storage.io": [], + "dbx.api.storage.mlflow_based": [], + "dbx.callbacks": [], + "dbx.cli": [], + "dbx.commands": [], + "dbx.commands.configure": [], + "dbx.commands.deploy": [], + "dbx.commands.destroy": [], + "dbx.commands.execute": [], + "dbx.commands.init": [], + "dbx.commands.launch": [], + "dbx.commands.sync": [], + "dbx.commands.sync.functions": [], + "dbx.commands.sync.options": [], + "dbx.commands.sync.sync": [], + "dbx.commands.version": [], + "dbx.constants": [], + "dbx.custom": [], + "dbx.models": [], + "dbx.models.build": [], + "dbx.models.cli": [], + "dbx.models.cli.destroyer": [], + "dbx.models.cli.execute": [], + "dbx.models.cli.options": [], + "dbx.models.deployment": [], + "dbx.models.files": [], + "dbx.models.files.context": [], + "dbx.models.files.project": [], + "dbx.models.validators": [], + "dbx.models.workflow": [], + "dbx.models.workflow.common": [], + "dbx.models.workflow.common.access_control": [], + "dbx.models.workflow.common.deployment_config": [], + "dbx.models.workflow.common.flexible": [], + "dbx.models.workflow.common.job_email_notifications": [], + "dbx.models.workflow.common.libraries": [], + "dbx.models.workflow.common.new_cluster": [], + "dbx.models.workflow.common.parameters": [], + "dbx.models.workflow.common.pipeline": [], + "dbx.models.workflow.common.task": [], + "dbx.models.workflow.common.task_type": [], + "dbx.models.workflow.common.workflow": [], + "dbx.models.workflow.common.workflow_types": [], + "dbx.models.workflow.v2dot0": [], + "dbx.models.workflow.v2dot0.parameters": [], + "dbx.models.workflow.v2dot0.task": [], + "dbx.models.workflow.v2dot0.workflow": [], + "dbx.models.workflow.v2dot1": [], + "dbx.models.workflow.v2dot1._parameters": [], + "dbx.models.workflow.v2dot1.job_cluster": [], + "dbx.models.workflow.v2dot1.job_task_settings": [], + "dbx.models.workflow.v2dot1.parameters": [], + "dbx.models.workflow.v2dot1.task": [], + "dbx.models.workflow.v2dot1.workflow": [], + "dbx.options": [], + "dbx.sync": [], + "dbx.sync.clients": [], + "dbx.sync.config": [], + "dbx.sync.constants": [], + "dbx.sync.event_handler": [], + "dbx.sync.path_matcher": [], + "dbx.sync.snapshot": [], + "dbx.templates.projects.python_basic.render.hooks.post_gen_project": [], + "dbx.types": [], + "dbx.utils": [], + "dbx.utils.common": [], + "dbx.utils.file_uploader": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/" + } + ], + "dbx.utils.json": [], + "dbx.utils.url": [] + }, "debugpy": { "debugpy": [] }, From 93bc10ec79f80af64999503a664f00347b015a12 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:31:20 +0200 Subject: [PATCH 08/22] whitelist deepspeed (#2002) ## Changes 
whitelist deepspeed ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 534 ++++++++++++++++++ 1 file changed, 534 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 096c72d6f9..902fda95f9 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1589,6 +1589,540 @@ "decorator": { "decorator": [] }, + "deepspeed": { + "deepspeed": [], + "deepspeed.accelerator": [], + "deepspeed.accelerator.abstract_accelerator": [], + "deepspeed.accelerator.cpu_accelerator": [], + "deepspeed.accelerator.cuda_accelerator": [], + "deepspeed.accelerator.hpu_accelerator": [], + "deepspeed.accelerator.mps_accelerator": [], + "deepspeed.accelerator.npu_accelerator": [], + "deepspeed.accelerator.real_accelerator": [], + "deepspeed.accelerator.xpu_accelerator": [], + "deepspeed.autotuning": [], + "deepspeed.autotuning.autotuner": [], + "deepspeed.autotuning.config": [], + "deepspeed.autotuning.constants": [], + "deepspeed.autotuning.scheduler": [], + "deepspeed.autotuning.tuner": [], + "deepspeed.autotuning.tuner.base_tuner": [], + "deepspeed.autotuning.tuner.cost_model": [], + "deepspeed.autotuning.tuner.index_based_tuner": [], + "deepspeed.autotuning.tuner.model_based_tuner": [], + "deepspeed.autotuning.tuner.utils": [], + "deepspeed.autotuning.utils": [], + "deepspeed.checkpoint": [], + "deepspeed.checkpoint.constants": [], + "deepspeed.checkpoint.deepspeed_checkpoint": [], + "deepspeed.checkpoint.ds_to_universal": [], + "deepspeed.checkpoint.reshape_3d_utils": [], + "deepspeed.checkpoint.reshape_meg_2d": [], + "deepspeed.checkpoint.reshape_utils": [], + "deepspeed.checkpoint.universal_checkpoint": [], + "deepspeed.checkpoint.utils": [], + "deepspeed.checkpoint.zero_checkpoint": [], + "deepspeed.comm": [], + "deepspeed.comm.backend": [], + "deepspeed.comm.ccl": [], + "deepspeed.comm.comm": [], + "deepspeed.comm.config": [], + "deepspeed.comm.constants": [], + "deepspeed.comm.reduce_op": [], + "deepspeed.comm.torch": [], + "deepspeed.comm.utils": [], + "deepspeed.compression": [], + "deepspeed.compression.basic_layer": [], + "deepspeed.compression.compress": [], + "deepspeed.compression.config": [], + "deepspeed.compression.constants": [], + "deepspeed.compression.helper": [], + "deepspeed.compression.scheduler": [], + "deepspeed.compression.utils": [], + "deepspeed.constants": [], + "deepspeed.elasticity": [], + "deepspeed.elasticity.config": [], + "deepspeed.elasticity.constants": [], + "deepspeed.elasticity.elastic_agent": [], + "deepspeed.elasticity.elasticity": [], + "deepspeed.elasticity.utils": [], + "deepspeed.env_report": [], + "deepspeed.git_version_info": [], + "deepspeed.git_version_info_installed": [], + "deepspeed.inference": [], + "deepspeed.inference.config": [], + "deepspeed.inference.engine": [], + "deepspeed.inference.quantization": [], + "deepspeed.inference.quantization.layers": [], + "deepspeed.inference.quantization.quantization": [], + "deepspeed.inference.quantization.quantization_context": [], + "deepspeed.inference.quantization.utils": [], + "deepspeed.inference.v2": [], + "deepspeed.inference.v2.allocator": [], + "deepspeed.inference.v2.checkpoint": [], + "deepspeed.inference.v2.checkpoint.base_engine": [], + "deepspeed.inference.v2.checkpoint.huggingface_engine": [], + "deepspeed.inference.v2.checkpoint.in_memory_engine": [], + 
"deepspeed.inference.v2.config_v2": [], + "deepspeed.inference.v2.engine_factory": [], + "deepspeed.inference.v2.engine_v2": [], + "deepspeed.inference.v2.inference_parameter": [], + "deepspeed.inference.v2.inference_utils": [], + "deepspeed.inference.v2.kernels": [], + "deepspeed.inference.v2.kernels.core_ops": [], + "deepspeed.inference.v2.kernels.core_ops.bias_activations": [], + "deepspeed.inference.v2.kernels.core_ops.bias_activations.bias_activation": [], + "deepspeed.inference.v2.kernels.core_ops.blas_kernels": [], + "deepspeed.inference.v2.kernels.core_ops.blas_kernels.blas_linear": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_layer_norm": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_layer_norm.cuda_fp_ln_base": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_layer_norm.cuda_ln": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_layer_norm.cuda_post_ln": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_layer_norm.cuda_pre_ln": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_linear": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_linear.cuda_linear": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_rms_norm": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_rms_norm.rms_norm": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_rms_norm.rms_norm_base": [], + "deepspeed.inference.v2.kernels.core_ops.cuda_rms_norm.rms_pre_norm": [], + "deepspeed.inference.v2.kernels.core_ops.gated_activations": [], + "deepspeed.inference.v2.kernels.core_ops.gated_activations.gated_activation": [], + "deepspeed.inference.v2.kernels.cutlass_ops": [], + "deepspeed.inference.v2.kernels.cutlass_ops.mixed_gemm": [], + "deepspeed.inference.v2.kernels.cutlass_ops.mixed_gemm.mixed_gemm": [], + "deepspeed.inference.v2.kernels.cutlass_ops.moe_gemm": [], + "deepspeed.inference.v2.kernels.cutlass_ops.moe_gemm.mixed_moe_gemm": [], + "deepspeed.inference.v2.kernels.cutlass_ops.moe_gemm.moe_gemm": [], + "deepspeed.inference.v2.kernels.ds_kernel": [], + "deepspeed.inference.v2.kernels.ragged_ops": [], + "deepspeed.inference.v2.kernels.ragged_ops.atom_builder": [], + "deepspeed.inference.v2.kernels.ragged_ops.atom_builder.atom_builder": [], + "deepspeed.inference.v2.kernels.ragged_ops.blocked_flash": [], + "deepspeed.inference.v2.kernels.ragged_ops.blocked_flash.blocked_flash": [], + "deepspeed.inference.v2.kernels.ragged_ops.embed": [], + "deepspeed.inference.v2.kernels.ragged_ops.embed.embed": [], + "deepspeed.inference.v2.kernels.ragged_ops.linear_blocked_kv_rotary": [], + "deepspeed.inference.v2.kernels.ragged_ops.linear_blocked_kv_rotary.blocked_kv_rotary": [], + "deepspeed.inference.v2.kernels.ragged_ops.linear_blocked_kv_rotary.blocked_trained_kv_rotary": [], + "deepspeed.inference.v2.kernels.ragged_ops.linear_blocked_kv_rotary.linear_blocked_kv_copy": [], + "deepspeed.inference.v2.kernels.ragged_ops.logits_gather": [], + "deepspeed.inference.v2.kernels.ragged_ops.logits_gather.logits_gather": [], + "deepspeed.inference.v2.kernels.ragged_ops.moe_gather": [], + "deepspeed.inference.v2.kernels.ragged_ops.moe_gather.moe_gather": [], + "deepspeed.inference.v2.kernels.ragged_ops.moe_scatter": [], + "deepspeed.inference.v2.kernels.ragged_ops.moe_scatter.moe_scatter": [], + "deepspeed.inference.v2.kernels.ragged_ops.top_k_gating": [], + "deepspeed.inference.v2.kernels.ragged_ops.top_k_gating.top_k_gating": [], + "deepspeed.inference.v2.logging": [], + "deepspeed.inference.v2.model_implementations": [], + "deepspeed.inference.v2.model_implementations.common_parameters": [], + 
"deepspeed.inference.v2.model_implementations.common_parameters.attn_output_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.embedding_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.invfreq_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.mlp_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.moe_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.norm_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.qkv_parameters": [], + "deepspeed.inference.v2.model_implementations.common_parameters.unembed_parameters": [], + "deepspeed.inference.v2.model_implementations.falcon": [], + "deepspeed.inference.v2.model_implementations.falcon.container": [], + "deepspeed.inference.v2.model_implementations.falcon.model": [], + "deepspeed.inference.v2.model_implementations.falcon.policy": [], + "deepspeed.inference.v2.model_implementations.flat_model_helpers": [], + "deepspeed.inference.v2.model_implementations.inference_model_base": [], + "deepspeed.inference.v2.model_implementations.inference_policy_base": [], + "deepspeed.inference.v2.model_implementations.inference_transformer_base": [], + "deepspeed.inference.v2.model_implementations.layer_container_base": [], + "deepspeed.inference.v2.model_implementations.llama_v2": [], + "deepspeed.inference.v2.model_implementations.llama_v2.container": [], + "deepspeed.inference.v2.model_implementations.llama_v2.model": [], + "deepspeed.inference.v2.model_implementations.llama_v2.policy": [], + "deepspeed.inference.v2.model_implementations.mistral": [], + "deepspeed.inference.v2.model_implementations.mistral.container": [], + "deepspeed.inference.v2.model_implementations.mistral.model": [], + "deepspeed.inference.v2.model_implementations.mistral.policy": [], + "deepspeed.inference.v2.model_implementations.mixtral": [], + "deepspeed.inference.v2.model_implementations.mixtral.container": [], + "deepspeed.inference.v2.model_implementations.mixtral.model": [], + "deepspeed.inference.v2.model_implementations.mixtral.policy": [], + "deepspeed.inference.v2.model_implementations.opt": [], + "deepspeed.inference.v2.model_implementations.opt.container": [], + "deepspeed.inference.v2.model_implementations.opt.model": [], + "deepspeed.inference.v2.model_implementations.opt.policy": [], + "deepspeed.inference.v2.model_implementations.parameter_base": [], + "deepspeed.inference.v2.model_implementations.phi": [], + "deepspeed.inference.v2.model_implementations.phi.containers": [], + "deepspeed.inference.v2.model_implementations.phi.model": [], + "deepspeed.inference.v2.model_implementations.phi.policy": [], + "deepspeed.inference.v2.model_implementations.qwen": [], + "deepspeed.inference.v2.model_implementations.qwen.container": [], + "deepspeed.inference.v2.model_implementations.qwen.model": [], + "deepspeed.inference.v2.model_implementations.qwen.policy": [], + "deepspeed.inference.v2.model_implementations.qwen_v2": [], + "deepspeed.inference.v2.model_implementations.qwen_v2.container": [], + "deepspeed.inference.v2.model_implementations.qwen_v2.model": [], + "deepspeed.inference.v2.model_implementations.qwen_v2.policy": [], + "deepspeed.inference.v2.model_implementations.sharding": [], + "deepspeed.inference.v2.model_implementations.sharding.attn": [], + "deepspeed.inference.v2.model_implementations.sharding.attn_out": [], + 
"deepspeed.inference.v2.model_implementations.sharding.embedding": [], + "deepspeed.inference.v2.model_implementations.sharding.mlp": [], + "deepspeed.inference.v2.model_implementations.sharding.qkv": [], + "deepspeed.inference.v2.model_implementations.sharding.types": [], + "deepspeed.inference.v2.model_implementations.sharding.unembed": [], + "deepspeed.inference.v2.model_implementations.sharding.utils": [], + "deepspeed.inference.v2.modules": [], + "deepspeed.inference.v2.modules.configs": [], + "deepspeed.inference.v2.modules.configs.attention_configs": [], + "deepspeed.inference.v2.modules.configs.embedding_config": [], + "deepspeed.inference.v2.modules.configs.linear_config": [], + "deepspeed.inference.v2.modules.configs.moe_config": [], + "deepspeed.inference.v2.modules.configs.norm_config": [], + "deepspeed.inference.v2.modules.configs.unembed_config": [], + "deepspeed.inference.v2.modules.ds_module": [], + "deepspeed.inference.v2.modules.heuristics": [], + "deepspeed.inference.v2.modules.implementations": [], + "deepspeed.inference.v2.modules.implementations.attention": [], + "deepspeed.inference.v2.modules.implementations.attention.dense_blocked_attention": [], + "deepspeed.inference.v2.modules.implementations.embedding": [], + "deepspeed.inference.v2.modules.implementations.embedding.ragged_embedding": [], + "deepspeed.inference.v2.modules.implementations.linear": [], + "deepspeed.inference.v2.modules.implementations.linear.blas_fp_linear": [], + "deepspeed.inference.v2.modules.implementations.linear.quantized_linear": [], + "deepspeed.inference.v2.modules.implementations.moe": [], + "deepspeed.inference.v2.modules.implementations.moe.cutlass_multi_gemm": [], + "deepspeed.inference.v2.modules.implementations.post_norm": [], + "deepspeed.inference.v2.modules.implementations.post_norm.cuda_post_ln": [], + "deepspeed.inference.v2.modules.implementations.pre_norm": [], + "deepspeed.inference.v2.modules.implementations.pre_norm.cuda_pre_ln": [], + "deepspeed.inference.v2.modules.implementations.pre_norm.cuda_pre_rms": [], + "deepspeed.inference.v2.modules.implementations.unembed": [], + "deepspeed.inference.v2.modules.implementations.unembed.ragged_unembed": [], + "deepspeed.inference.v2.modules.interfaces": [], + "deepspeed.inference.v2.modules.interfaces.attention_base": [], + "deepspeed.inference.v2.modules.interfaces.embedding_base": [], + "deepspeed.inference.v2.modules.interfaces.linear_base": [], + "deepspeed.inference.v2.modules.interfaces.moe_base": [], + "deepspeed.inference.v2.modules.interfaces.post_norm_base": [], + "deepspeed.inference.v2.modules.interfaces.pre_norm_base": [], + "deepspeed.inference.v2.modules.interfaces.unembed_base": [], + "deepspeed.inference.v2.modules.module_registry": [], + "deepspeed.inference.v2.ragged": [], + "deepspeed.inference.v2.ragged.blocked_allocator": [], + "deepspeed.inference.v2.ragged.kv_cache": [], + "deepspeed.inference.v2.ragged.manager_configs": [], + "deepspeed.inference.v2.ragged.ragged_manager": [], + "deepspeed.inference.v2.ragged.ragged_wrapper": [], + "deepspeed.inference.v2.ragged.sequence_descriptor": [], + "deepspeed.inference.v2.scheduling_utils": [], + "deepspeed.launcher": [], + "deepspeed.launcher.constants": [], + "deepspeed.launcher.launch": [], + "deepspeed.launcher.launcher_helper": [], + "deepspeed.launcher.multinode_runner": [], + "deepspeed.launcher.runner": [], + "deepspeed.linear": [], + "deepspeed.linear.config": [], + "deepspeed.linear.optimized_linear": [], + "deepspeed.linear.quantization": [], + 
"deepspeed.model_implementations": [], + "deepspeed.model_implementations.diffusers": [], + "deepspeed.model_implementations.diffusers.unet": [], + "deepspeed.model_implementations.diffusers.vae": [], + "deepspeed.model_implementations.features": [], + "deepspeed.model_implementations.features.cuda_graph": [], + "deepspeed.model_implementations.transformers": [], + "deepspeed.model_implementations.transformers.clip_encoder": [], + "deepspeed.model_implementations.transformers.ds_base": [], + "deepspeed.model_implementations.transformers.ds_bert": [], + "deepspeed.model_implementations.transformers.ds_bloom": [], + "deepspeed.model_implementations.transformers.ds_gpt": [], + "deepspeed.model_implementations.transformers.ds_llama2": [], + "deepspeed.model_implementations.transformers.ds_megatron_gpt": [], + "deepspeed.model_implementations.transformers.ds_opt": [], + "deepspeed.model_implementations.transformers.ds_transformer": [], + "deepspeed.module_inject": [], + "deepspeed.module_inject.auto_tp": [], + "deepspeed.module_inject.auto_tp_model_utils": [], + "deepspeed.module_inject.containers": [], + "deepspeed.module_inject.containers.base": [], + "deepspeed.module_inject.containers.base_moe": [], + "deepspeed.module_inject.containers.bert": [], + "deepspeed.module_inject.containers.bloom": [], + "deepspeed.module_inject.containers.clip": [], + "deepspeed.module_inject.containers.distil_bert": [], + "deepspeed.module_inject.containers.features": [], + "deepspeed.module_inject.containers.features.gated_mlp": [], + "deepspeed.module_inject.containers.features.hybrid_engine": [], + "deepspeed.module_inject.containers.features.hybrid_megatron": [], + "deepspeed.module_inject.containers.features.megatron": [], + "deepspeed.module_inject.containers.features.meta_tensor": [], + "deepspeed.module_inject.containers.features.split_qkv": [], + "deepspeed.module_inject.containers.gpt2": [], + "deepspeed.module_inject.containers.gptj": [], + "deepspeed.module_inject.containers.gptneo": [], + "deepspeed.module_inject.containers.gptneox": [], + "deepspeed.module_inject.containers.internlm": [], + "deepspeed.module_inject.containers.llama": [], + "deepspeed.module_inject.containers.llama2": [], + "deepspeed.module_inject.containers.megatron_gpt": [], + "deepspeed.module_inject.containers.megatron_gpt_moe": [], + "deepspeed.module_inject.containers.opt": [], + "deepspeed.module_inject.containers.unet": [], + "deepspeed.module_inject.containers.vae": [], + "deepspeed.module_inject.fusedqkv_utils": [], + "deepspeed.module_inject.inject": [], + "deepspeed.module_inject.layers": [], + "deepspeed.module_inject.load_checkpoint": [], + "deepspeed.module_inject.module_quantize": [], + "deepspeed.module_inject.policy": [], + "deepspeed.module_inject.replace_module": [], + "deepspeed.module_inject.replace_policy": [], + "deepspeed.module_inject.tp_shard": [], + "deepspeed.module_inject.utils": [], + "deepspeed.moe": [], + "deepspeed.moe.experts": [], + "deepspeed.moe.layer": [], + "deepspeed.moe.mappings": [], + "deepspeed.moe.sharded_moe": [], + "deepspeed.moe.utils": [], + "deepspeed.monitor": [], + "deepspeed.monitor.comet": [], + "deepspeed.monitor.config": [], + "deepspeed.monitor.csv_monitor": [], + "deepspeed.monitor.monitor": [], + "deepspeed.monitor.tensorboard": [], + "deepspeed.monitor.utils": [], + "deepspeed.monitor.wandb": [], + "deepspeed.nebula": [], + "deepspeed.nebula.config": [], + "deepspeed.nebula.constants": [], + "deepspeed.ops": [], + "deepspeed.ops.adagrad": [], + 
"deepspeed.ops.adagrad.cpu_adagrad": [], + "deepspeed.ops.adam": [], + "deepspeed.ops.adam.cpu_adam": [], + "deepspeed.ops.adam.fused_adam": [], + "deepspeed.ops.adam.multi_tensor_apply": [], + "deepspeed.ops.aio": [], + "deepspeed.ops.deepspeed4science": [], + "deepspeed.ops.deepspeed4science.evoformer_attn": [], + "deepspeed.ops.fp_quantizer": [], + "deepspeed.ops.fp_quantizer.quantize": [], + "deepspeed.ops.lamb": [], + "deepspeed.ops.lamb.fused_lamb": [], + "deepspeed.ops.lion": [], + "deepspeed.ops.lion.cpu_lion": [], + "deepspeed.ops.lion.fused_lion": [], + "deepspeed.ops.lion.multi_tensor_apply": [], + "deepspeed.ops.op_builder": [], + "deepspeed.ops.op_builder.all_ops": [], + "deepspeed.ops.op_builder.async_io": [], + "deepspeed.ops.op_builder.builder": [], + "deepspeed.ops.op_builder.cpu": [], + "deepspeed.ops.op_builder.cpu.builder": [], + "deepspeed.ops.op_builder.cpu.comm": [], + "deepspeed.ops.op_builder.cpu.cpu_adam": [], + "deepspeed.ops.op_builder.cpu.fused_adam": [], + "deepspeed.ops.op_builder.cpu.no_impl": [], + "deepspeed.ops.op_builder.cpu_adagrad": [], + "deepspeed.ops.op_builder.cpu_adam": [], + "deepspeed.ops.op_builder.cpu_lion": [], + "deepspeed.ops.op_builder.evoformer_attn": [], + "deepspeed.ops.op_builder.fp_quantizer": [], + "deepspeed.ops.op_builder.fused_adam": [], + "deepspeed.ops.op_builder.fused_lamb": [], + "deepspeed.ops.op_builder.fused_lion": [], + "deepspeed.ops.op_builder.hpu": [], + "deepspeed.ops.op_builder.hpu.builder": [], + "deepspeed.ops.op_builder.hpu.cpu_adam": [], + "deepspeed.ops.op_builder.hpu.fused_adam": [], + "deepspeed.ops.op_builder.hpu.no_impl": [], + "deepspeed.ops.op_builder.inference_core_ops": [], + "deepspeed.ops.op_builder.inference_cutlass_builder": [], + "deepspeed.ops.op_builder.npu": [], + "deepspeed.ops.op_builder.npu.async_io": [], + "deepspeed.ops.op_builder.npu.builder": [], + "deepspeed.ops.op_builder.npu.cpu_adagrad": [], + "deepspeed.ops.op_builder.npu.cpu_adam": [], + "deepspeed.ops.op_builder.npu.cpu_lion": [], + "deepspeed.ops.op_builder.npu.fused_adam": [], + "deepspeed.ops.op_builder.npu.inference": [], + "deepspeed.ops.op_builder.npu.no_impl": [], + "deepspeed.ops.op_builder.quantizer": [], + "deepspeed.ops.op_builder.ragged_ops": [], + "deepspeed.ops.op_builder.ragged_utils": [], + "deepspeed.ops.op_builder.random_ltd": [], + "deepspeed.ops.op_builder.sparse_attn": [], + "deepspeed.ops.op_builder.spatial_inference": [], + "deepspeed.ops.op_builder.stochastic_transformer": [], + "deepspeed.ops.op_builder.transformer": [], + "deepspeed.ops.op_builder.transformer_inference": [], + "deepspeed.ops.op_builder.xpu": [], + "deepspeed.ops.op_builder.xpu.async_io": [], + "deepspeed.ops.op_builder.xpu.builder": [], + "deepspeed.ops.op_builder.xpu.cpu_adagrad": [], + "deepspeed.ops.op_builder.xpu.cpu_adam": [], + "deepspeed.ops.op_builder.xpu.fused_adam": [], + "deepspeed.ops.op_builder.xpu.packbits": [], + "deepspeed.ops.quantizer": [], + "deepspeed.ops.quantizer.quantizer": [], + "deepspeed.ops.random_ltd": [], + "deepspeed.ops.random_ltd.dropping_utils": [], + "deepspeed.ops.sparse_attention": [], + "deepspeed.ops.sparse_attention.bert_sparse_self_attention": [], + "deepspeed.ops.sparse_attention.matmul": [], + "deepspeed.ops.sparse_attention.softmax": [], + "deepspeed.ops.sparse_attention.sparse_attention_utils": [], + "deepspeed.ops.sparse_attention.sparse_self_attention": [], + "deepspeed.ops.sparse_attention.sparsity_config": [], + "deepspeed.ops.sparse_attention.trsrc": [], + "deepspeed.ops.transformer": [], + 
"deepspeed.ops.transformer.inference": [], + "deepspeed.ops.transformer.inference.bias_add": [], + "deepspeed.ops.transformer.inference.config": [], + "deepspeed.ops.transformer.inference.diffusers_2d_transformer": [], + "deepspeed.ops.transformer.inference.diffusers_attention": [], + "deepspeed.ops.transformer.inference.diffusers_transformer_block": [], + "deepspeed.ops.transformer.inference.ds_attention": [], + "deepspeed.ops.transformer.inference.ds_mlp": [], + "deepspeed.ops.transformer.inference.moe_inference": [], + "deepspeed.ops.transformer.inference.op_binding": [], + "deepspeed.ops.transformer.inference.op_binding.base": [], + "deepspeed.ops.transformer.inference.op_binding.gelu_gemm": [], + "deepspeed.ops.transformer.inference.op_binding.linear": [], + "deepspeed.ops.transformer.inference.op_binding.mlp_gemm": [], + "deepspeed.ops.transformer.inference.op_binding.qkv_gemm": [], + "deepspeed.ops.transformer.inference.op_binding.residual_add": [], + "deepspeed.ops.transformer.inference.op_binding.softmax": [], + "deepspeed.ops.transformer.inference.op_binding.softmax_context": [], + "deepspeed.ops.transformer.inference.op_binding.vector_matmul": [], + "deepspeed.ops.transformer.inference.triton": [], + "deepspeed.ops.transformer.inference.triton.attention": [], + "deepspeed.ops.transformer.inference.triton.gelu": [], + "deepspeed.ops.transformer.inference.triton.layer_norm": [], + "deepspeed.ops.transformer.inference.triton.matmul_ext": [], + "deepspeed.ops.transformer.inference.triton.mlp": [], + "deepspeed.ops.transformer.inference.triton.ops": [], + "deepspeed.ops.transformer.inference.triton.residual_add": [], + "deepspeed.ops.transformer.inference.triton.softmax": [], + "deepspeed.ops.transformer.inference.triton.triton_matmul_kernel": [], + "deepspeed.ops.transformer.inference.triton_ops": [], + "deepspeed.ops.transformer.transformer": [], + "deepspeed.pipe": [], + "deepspeed.profiling": [], + "deepspeed.profiling.config": [], + "deepspeed.profiling.constants": [], + "deepspeed.profiling.flops_profiler": [], + "deepspeed.profiling.flops_profiler.profiler": [], + "deepspeed.pydantic_v1": [], + "deepspeed.runtime": [], + "deepspeed.runtime.activation_checkpointing": [], + "deepspeed.runtime.activation_checkpointing.checkpointing": [], + "deepspeed.runtime.activation_checkpointing.config": [], + "deepspeed.runtime.base_optimizer": [], + "deepspeed.runtime.bf16_optimizer": [], + "deepspeed.runtime.checkpoint_engine": [], + "deepspeed.runtime.checkpoint_engine.checkpoint_engine": [], + "deepspeed.runtime.checkpoint_engine.nebula_checkpoint_engine": [], + "deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine": [], + "deepspeed.runtime.comm": [], + "deepspeed.runtime.comm.coalesced_collectives": [], + "deepspeed.runtime.comm.compressed": [], + "deepspeed.runtime.comm.hccl": [], + "deepspeed.runtime.comm.mpi": [], + "deepspeed.runtime.comm.nccl": [], + "deepspeed.runtime.compiler": [], + "deepspeed.runtime.compression": [], + "deepspeed.runtime.compression.cupy": [], + "deepspeed.runtime.config": [], + "deepspeed.runtime.config_utils": [], + "deepspeed.runtime.constants": [], + "deepspeed.runtime.data_pipeline": [], + "deepspeed.runtime.data_pipeline.config": [], + "deepspeed.runtime.data_pipeline.constants": [], + "deepspeed.runtime.data_pipeline.curriculum_scheduler": [], + "deepspeed.runtime.data_pipeline.data_routing": [], + "deepspeed.runtime.data_pipeline.data_routing.basic_layer": [], + "deepspeed.runtime.data_pipeline.data_routing.helper": [], + 
"deepspeed.runtime.data_pipeline.data_routing.scheduler": [], + "deepspeed.runtime.data_pipeline.data_routing.utils": [], + "deepspeed.runtime.data_pipeline.data_sampling": [], + "deepspeed.runtime.data_pipeline.data_sampling.data_analyzer": [], + "deepspeed.runtime.data_pipeline.data_sampling.data_sampler": [], + "deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset": [], + "deepspeed.runtime.data_pipeline.data_sampling.utils": [], + "deepspeed.runtime.dataloader": [], + "deepspeed.runtime.eigenvalue": [], + "deepspeed.runtime.engine": [], + "deepspeed.runtime.fp16": [], + "deepspeed.runtime.fp16.fused_optimizer": [], + "deepspeed.runtime.fp16.loss_scaler": [], + "deepspeed.runtime.fp16.onebit": [], + "deepspeed.runtime.fp16.onebit.adam": [], + "deepspeed.runtime.fp16.onebit.lamb": [], + "deepspeed.runtime.fp16.onebit.zoadam": [], + "deepspeed.runtime.fp16.unfused_optimizer": [], + "deepspeed.runtime.hybrid_engine": [], + "deepspeed.runtime.lr_schedules": [], + "deepspeed.runtime.pipe": [], + "deepspeed.runtime.pipe.engine": [], + "deepspeed.runtime.pipe.module": [], + "deepspeed.runtime.pipe.p2p": [], + "deepspeed.runtime.pipe.schedule": [], + "deepspeed.runtime.pipe.topology": [], + "deepspeed.runtime.progressive_layer_drop": [], + "deepspeed.runtime.quantize": [], + "deepspeed.runtime.sparse_tensor": [], + "deepspeed.runtime.state_dict_factory": [], + "deepspeed.runtime.swap_tensor": [], + "deepspeed.runtime.swap_tensor.aio_config": [], + "deepspeed.runtime.swap_tensor.async_swapper": [], + "deepspeed.runtime.swap_tensor.constants": [], + "deepspeed.runtime.swap_tensor.optimizer_utils": [], + "deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper": [], + "deepspeed.runtime.swap_tensor.partitioned_param_swapper": [], + "deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper": [], + "deepspeed.runtime.swap_tensor.utils": [], + "deepspeed.runtime.utils": [], + "deepspeed.runtime.weight_quantizer": [], + "deepspeed.runtime.zero": [], + "deepspeed.runtime.zero.config": [], + "deepspeed.runtime.zero.contiguous_memory_allocator": [], + "deepspeed.runtime.zero.linear": [], + "deepspeed.runtime.zero.mics": [], + "deepspeed.runtime.zero.mics_utils": [], + "deepspeed.runtime.zero.offload_config": [], + "deepspeed.runtime.zero.parameter_offload": [], + "deepspeed.runtime.zero.partition_parameters": [], + "deepspeed.runtime.zero.partitioned_param_coordinator": [], + "deepspeed.runtime.zero.partitioned_param_profiler": [], + "deepspeed.runtime.zero.stage3": [], + "deepspeed.runtime.zero.stage_1_and_2": [], + "deepspeed.runtime.zero.test": [], + "deepspeed.runtime.zero.tiling": [], + "deepspeed.runtime.zero.utils": [], + "deepspeed.sequence": [], + "deepspeed.sequence.layer": [], + "deepspeed.utils": [], + "deepspeed.utils.bwc": [], + "deepspeed.utils.comms_logging": [], + "deepspeed.utils.config": [], + "deepspeed.utils.debug": [], + "deepspeed.utils.exceptions": [], + "deepspeed.utils.groups": [], + "deepspeed.utils.init_on_device": [], + "deepspeed.utils.logging": [], + "deepspeed.utils.mixed_precision_linkage": [], + "deepspeed.utils.numa": [], + "deepspeed.utils.nvtx": [], + "deepspeed.utils.tensor_fragment": [], + "deepspeed.utils.timer": [], + "deepspeed.utils.torch": [], + "deepspeed.utils.types": [], + "deepspeed.utils.z3_leaf_module": [], + "deepspeed.utils.zero_to_fp32": [] + }, "delta-spark": { "delta": [], "delta._typing": [], From 5209d63b61e74173ab642f715a1a2afe10cb48e0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:32:11 +0200 Subject: [PATCH 
09/22] whitelist spark_price_transparency (#2003) ## Changes whitelist spark_price_transparency ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested --------- Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 902fda95f9..6a66138446 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -24623,6 +24623,183 @@ "spark-ocr": { "sparkocr": [] }, + "spark-price-transparency": { + "spark_price_transparency": [], + "spark_price_transparency.allowed_amounts": [], + "spark_price_transparency.allowed_amounts.aa_header": [], + "spark_price_transparency.allowed_amounts.aa_network": [], + "spark_price_transparency.allowed_amounts.curate_viz": [], + "spark_price_transparency.allowed_amounts.files": [], + "spark_price_transparency.allowed_amounts.out_amount": [], + "spark_price_transparency.allowed_amounts.out_code": [], + "spark_price_transparency.allowed_amounts.schema": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db/_raw/mth=" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.curate_viz": [], + "spark_price_transparency.end2end_job_viz": [], + "spark_price_transparency.end2end_viz": [], + "spark_price_transparency.entity_viz": [], + "spark_price_transparency.guide": [], + "spark_price_transparency.in_network": [], + "spark_price_transparency.in_network.inr_header": [], + "spark_price_transparency.in_network.inr_network": [], + "spark_price_transparency.in_network.inr_provider": [], + "spark_price_transparency.in_network_rates": [], + "spark_price_transparency.in_network_rates.curate_viz": [], + "spark_price_transparency.in_network_rates.files": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.in_network_rates.in_coverage": [], + "spark_price_transparency.in_network_rates.in_pr_loc": [], + "spark_price_transparency.in_network_rates.in_provider": [], + "spark_price_transparency.in_network_rates.in_rate": [], + "spark_price_transparency.in_network_rates.inr_header": [], + "spark_price_transparency.in_network_rates.inr_network": [], + "spark_price_transparency.in_network_rates.inr_provider": [], + "spark_price_transparency.in_network_rates.schema": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db/_raw/mth=" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. 
Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.methods_viz": [], + "spark_price_transparency.provider_reference": [], + "spark_price_transparency.provider_reference.curate_viz": [], + "spark_price_transparency.provider_reference.files": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.provider_reference.in_provider": [], + "spark_price_transparency.provider_reference.in_rate": [], + "spark_price_transparency.provider_reference.pr_provider": [], + "spark_price_transparency.provider_reference.schema": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db/_raw/mth=" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.pt_analytic_table": [], + "spark_price_transparency.pt_files": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db/_raw/mth=" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.pt_functions": [], + "spark_price_transparency.pt_ingest_table": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "spark_price_transparency.pt_raw": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db" + } + ], + "spark_price_transparency.pt_schema": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/tmp/pt/_checkpoint" + } + ], + "spark_price_transparency.pt_stage": [], + "spark_price_transparency.pt_table": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.pt_types": [], + "spark_price_transparency.table": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.table_analytic": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.table_of_contents": [], + "spark_price_transparency.table_of_contents.curate_viz": [], + "spark_price_transparency.table_of_contents.files": [], + "spark_price_transparency.table_of_contents.index_reports": [], + "spark_price_transparency.table_of_contents.schema": [ + { + "code": "dbfs-usage", + "message": "Deprecated file system path: dbfs:/user/hive/warehouse/pt_raw.db/_raw/mth=" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. 
Rewrite it using spark" + }, + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.table_of_contents.toc_header": [], + "spark_price_transparency.table_of_contents.toc_reporting": [], + "spark_price_transparency.table_stream_tgt": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "spark_price_transparency.unified_viz": [] + }, "sqlalchemy": { "sqlalchemy": [] }, From 6198a280df841dadf5332f835302ae5a6ef1c3b5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Jul 2024 09:34:13 +0200 Subject: [PATCH 10/22] Tidy up whitespace (#1923) ## Changes This PR follows on from #1778 by cleaning up files that use tabs, have trailing whitespace, or are missing EOL at the end of files. By fixing these in one go other PRs become less noisy. (Sample files used in tests have been left alone and are not affected by this PR.) --- .codegen.json | 2 +- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/ISSUE_TEMPLATE/feature.yml | 1 - .github/codecov.yml | 2 +- .github/dependabot.yml | 2 +- .github/pull_request_template.md | 2 +- .github/workflows/no-cheat.yml | 2 +- .github/workflows/release.yml | 8 +- docs/external_hms_glue.md | 22 ++--- docs/group_name_conflict.md | 4 +- docs/local-group-migration.md | 92 +++++++++---------- docs/table_upgrade.md | 88 +++++++++--------- docs/troubleshooting.md | 8 +- src/databricks/labs/ucx/mixins/README.md | 2 +- .../azure/05_0_azure_service_principals.sql | 2 +- .../estimates/00_0_metastore_assignment.md | 4 +- ..._4_is_incompatible_submit_run_detected.sql | 4 +- .../estimates/01_0_group_migration.md | 4 +- .../estimates/01_2_group_migration.sql | 2 +- .../estimates/02_0_data_modeling.md | 4 +- .../estimates/02_2_uc_data_modeling.sql | 2 +- .../02_5_uc_data_modeling_complexity.sql | 2 +- .../estimates/03_0_data_migration.md | 4 +- .../estimates/03_2_data_migration_summary.sql | 2 +- .../03_5_data_migration_complexity.sql | 2 +- .../interactive/00_0_interactive.md | 2 +- ...compute_access_mode_limitation_summary.sql | 20 ++-- .../interactive/02_0_cluster_summary.md | 2 +- .../interactive/03_0_cluster_summary.sql | 16 ++-- .../main/00_5_count_total_views.sql | 2 +- .../main/00___assessment_overview.md | 2 +- .../assessment/main/01_0_count_jobs.sql | 4 +- .../main/02_2_count_table_by_storage.sql | 2 +- .../assessment/main/05_0_object_readiness.sql | 4 +- .../main/05_2_assessment_summary.sql | 4 +- .../queries/assessment/main/10_0_all_udfs.sql | 2 +- .../assessment/main/10_0_database_summary.sql | 2 +- .../assessment/main/15_3_mount_points.sql | 2 +- .../assessment/main/20_0_cluster_policies.sql | 2 +- .../queries/assessment/main/20_0_clusters.sql | 2 +- .../assessment/main/30_3_job_details.sql | 2 +- .../ucx/queries/assessment/main/30_3_jobs.sql | 2 +- .../assessment/main/30_4_submit_runs.sql | 2 +- .../assessment/main/40_0_pipelines.sql | 2 +- .../ucx/queries/assessment/main/40_2_logs.sql | 2 +- .../main/40_3_global_init_scripts.sql | 2 +- .../ucx/queries/assessment/main/README.md | 2 +- .../main/01_0_data_object_migration_status.md | 4 +- .../01_1_data_object_migration_summary.sql | 2 +- .../main/02_1_code_compatibility_problems.sql | 2 +- .../main/02_1_data_reconciliation_summary.sql | 2 +- .../main/03_1_data_reconciliation_status.sql | 2 +- .../labs/ucx/queries/views/code_patterns.sql | 16 ++-- .../labs/ucx/queries/views/misc_patterns.sql | 6 +- 
.../queries/views/reconciliation_results.sql | 2 +- .../ucx/queries/views/table_estimates.sql | 2 +- .../labs/ucx/recon/data_comparator.py | 10 +- .../labs/ucx/recon/metadata_retriever.py | 10 +- ...e-spn-secret-interactive-multiple-spn.json | 2 +- .../assessment/clusters/azure-spn-secret.json | 2 +- .../clusters/init-scripts-dbfs.json | 2 +- .../clusters/init-scripts-file.json | 2 +- .../clusters/init-scripts-no-match.json | 2 +- .../unit/assessment/clusters/job-cluster.json | 2 +- .../clusters/legacy-passthrough.json | 2 +- .../assessment/clusters/no-isolation.json | 2 +- .../clusters/outdated-autoscale.json | 2 +- .../unit/assessment/clusters/passthrough.json | 2 +- .../clusters/policy-azure-oauth.json | 2 +- .../assessment/clusters/policy-deleted.json | 2 +- ...licy-single-user-with-empty-appid-spn.json | 2 +- .../clusters/policy-single-user-with-spn.json | 2 +- .../policy-spn-in-policy-overrides.json | 2 +- .../clusters/simplest-autoscale.json | 2 +- tests/unit/assessment/jobruns/dbt_task.json | 2 +- .../assessment/jobruns/gitsource_task.json | 2 +- tests/unit/assessment/jobruns/jar_task.json | 2 +- .../jobruns/notebook_dupe_task.json | 2 +- .../jobruns/notebook_no_failure_task.json | 2 +- .../jobruns/notebook_no_sec_comp_task.json | 2 +- .../jobruns/notebook_no_sec_no_comp_task.json | 2 +- .../jobruns/notebook_spark_conf_task.json | 2 +- .../assessment/jobruns/notebook_task.json | 2 +- .../assessment/jobruns/python_wheel_task.json | 2 +- .../jobruns/run_condition_task.json | 2 +- .../assessment/jobruns/spark_jar_task.json | 2 +- tests/unit/assessment/jobruns/sql_tasks.json | 2 +- .../jobs/legacy-job-on-azure-spn-secret.json | 2 +- tests/unit/assessment/jobs/no-settings.json | 2 +- tests/unit/assessment/jobs/no-tasks.json | 2 +- .../assessment/jobs/on-azure-spn-secret.json | 2 +- .../jobs/on-outdated-autoscale.json | 2 +- .../jobs/on-simplest-autoscale.json | 2 +- .../jobs/policy-single-job-with-spn.json | 2 +- tests/unit/assessment/jobs/single-job.json | 2 +- tests/unit/assessment/jobs/some-spn.json | 2 +- .../unit/assessment/jobs/spark-jar-task.json | 2 +- .../unit/assessment/pipelines/empty-spec.json | 2 +- .../assessment/pipelines/spec-with-spn.json | 2 +- tests/unit/assessment/policies/ext-hms.json | 2 +- .../policies/single-job-with-spn.json | 2 +- .../single-user-with-empty-appid-spn.json | 2 +- .../single-user-with-spn-no-sparkversion.json | 2 +- .../single-user-with-spn-policyid.json | 2 +- .../policies/single-user-with-spn.json | 2 +- .../policies/spn-in-policy-overrides.json | 2 +- .../warehouses/dupe-spn-config.json | 2 +- .../assessment/warehouses/single-config.json | 2 +- .../assessment/warehouses/spn-config.json | 2 +- .../warehouses/spn-secret-config.json | 2 +- tests/unit/azure/azure/mappings.json | 2 +- .../hive_metastore/tables/dbfs_parquet.json | 2 +- .../tables/external_hiveserde.json | 2 +- .../tables/external_no_sync.json | 2 +- .../external_no_sync_missing_location.json | 2 +- .../hive_metastore/tables/external_src.json | 2 +- .../tables/external_src_unsupported.json | 2 +- .../hive_metastore/tables/managed_dbfs.json | 2 +- .../hive_metastore/tables/managed_mnt.json | 2 +- .../hive_metastore/tables/managed_other.json | 2 +- .../tables/tables_and_views.json | 2 +- tests/unit/hive_metastore/tables/view.json | 2 +- ...3_SparkR_Fine Grained Demand Forecasting.r | 2 +- ucx.iml | 2 +- 124 files changed, 270 insertions(+), 271 deletions(-) diff --git a/.codegen.json b/.codegen.json index a00911e0b0..b2fd6857fd 100644 --- a/.codegen.json +++ b/.codegen.json @@ -11,4 
+11,4 @@ "pytest -n 4 --cov src --cov-report=xml --timeout 30 tests/unit --durations 20" ] } -} \ No newline at end of file +} diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index fc9d764b44..8eeef6eb4a 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -3,7 +3,7 @@ contact_links: - name: General Databricks questions url: https://help.databricks.com/ about: Issues related to Databricks and not related to UCX - + - name: UCX Documentation url: https://github.com/databrickslabs/ucx/tree/main/docs about: Documentation about UCX diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml index 8bf7ad146b..4572167772 100644 --- a/.github/ISSUE_TEMPLATE/feature.yml +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -33,4 +33,3 @@ body: description: Add any other context, references or screenshots about the feature request here. validations: required: false - \ No newline at end of file diff --git a/.github/codecov.yml b/.github/codecov.yml index aa8cf6b8d7..aaa25bf74e 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -7,4 +7,4 @@ coverage: patch: default: target: auto - threshold: 0.5% \ No newline at end of file + threshold: 0.5% diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 8c763bf215..b728efb6e0 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -7,4 +7,4 @@ updates: - package-ecosystem: "github-actions" directory: "/" schedule: - interval: "daily" \ No newline at end of file + interval: "daily" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 26db60bacf..40fd8e0780 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -7,7 +7,7 @@ Resolves #.. -### Functionality +### Functionality - [ ] added relevant user documentation - [ ] added new CLI command diff --git a/.github/workflows/no-cheat.yml b/.github/workflows/no-cheat.yml index 3e150d0eeb..c1864acf45 100644 --- a/.github/workflows/no-cheat.yml +++ b/.github/workflows/no-cheat.yml @@ -27,4 +27,4 @@ jobs: if [ "${CHEAT}" -ne 0 ]; then echo "Do not cheat the linter: ${CHEAT}" exit 1 - fi \ No newline at end of file + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0a0fb71ce8..d645b0c9fc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,12 +22,12 @@ jobs: cache: 'pip' cache-dependency-path: '**/pyproject.toml' python-version: '3.10' - + - name: Build wheels run: | pip install hatch==1.9.4 hatch build - + - name: Draft release uses: softprops/action-gh-release@v2 with: @@ -38,11 +38,11 @@ jobs: - uses: pypa/gh-action-pypi-publish@release/v1 name: Publish package distributions to PyPI - + - name: Sign artifacts with Sigstore uses: sigstore/gh-action-sigstore-python@v2.1.1 with: inputs: | dist/databricks_*.whl dist/databricks_*.tar.gz - release-signing-artifacts: true \ No newline at end of file + release-signing-artifacts: true diff --git a/docs/external_hms_glue.md b/docs/external_hms_glue.md index 4c1a149e44..69ca592cfc 100644 --- a/docs/external_hms_glue.md +++ b/docs/external_hms_glue.md @@ -10,14 +10,14 @@ External Hive Metastore Integration * [Additional Considerations](#additional-considerations) -UCX works with both the default workspace metastore, or an external Hive metastore. This document outlines the current +UCX works with both the default workspace metastore, or an external Hive metastore. 
This document outlines the current integration and how to set up UCX to work with your existing external metastore. # Installation The setup process follows the following steps -- UCX scans existing cluster policies, and Databricks SQL data access configuration for Spark configurations key that +- UCX scans existing cluster policies, and Databricks SQL data access configuration for Spark configurations key that enables external Hive metastore: - Spark config `spark.databricks.hive.metastore.glueCatalog.enabled=true` - for Glue Catalog - Spark config containing prefixes `spark.sql.hive.metastore` - for external Hive metastore @@ -25,15 +25,15 @@ enables external Hive metastore: _We have identified one or more cluster policies set up for an external metastore. Would you like to set UCX to connect to the external metastore?_ - Selecting **Yes** will display a list of the matching policies and allow the user to select the appropriate policies. -- The chosen policy will be used as the template to set up UCX job clusters via a new policy. UCX will clone the +- The chosen policy will be used as the template to set up UCX job clusters via a new policy. UCX will clone the necessary Spark configurations and data access configurations, e.g. Instance Profile over to this new policy. - When prompted for an inventory database, please specify a new name instead of the default `ucx` to avoid conflict. This is because the inventory database will be created in the external metastore, which is shared across multiple workspaces. -- UCX **DOES NOT** update the data access configuration for SQL Warehouses. This is because Databricks SQL settings apply +- UCX **DOES NOT** update the data access configuration for SQL Warehouses. This is because Databricks SQL settings apply to all warehouses in a workspace, and can introduce unexpected changes to existing workload. **Note** -As UCX uses both job clusters and SQL Warehouses, it is important to ensure that both are configured to use the same +As UCX uses both job clusters and SQL Warehouses, it is important to ensure that both are configured to use the same external Hive metastore. If the SQL Warehouses are not configured for external Hive metastore, please manually update the data access configuration. See [Enable data access configuration](https://learn.microsoft.com/en-us/azure/databricks/admin/sql/data-access-configuration) for more details @@ -41,7 +41,7 @@ the data access configuration. See [Enable data access configuration](https://le # Manual Override -If the workspace does not have a cluster policy or SQL data access configuration for external Hive metastore, there are +If the workspace does not have a cluster policy or SQL data access configuration for external Hive metastore, there are two options to manually enable this: - *Pre-installation*: create a cluster policy with the appropriate Spark configuration and data access for external metastore: - See the following documentation pages for more details: [Glue catalog](https://docs.databricks.com/en/archive/external-metastores/aws-glue-metastore.html) and [External Hive Metastore](https://learn.microsoft.com/en-us/azure/databricks/archive/external-metastores/external-hive-metastore). @@ -70,13 +70,13 @@ following the post-installation steps above. 
# Assessment Workflow -Once UCX is set up with external Hive metastore the assessment workflow will scan tables & views from the external +Once UCX is set up with external Hive metastore the assessment workflow will scan tables & views from the external Hive metastore instead of the default workspace metastore. If the external Hive metastore is shared between multiple workspaces, please specify a different inventory database name for each UCX installation. This is to avoid conflicts between the inventory databases. -As the inventory database is stored in the external Hive metastore, it can only be queried from a cluster or SQL warehouse +As the inventory database is stored in the external Hive metastore, it can only be queried from a cluster or SQL warehouse with external Hive metastore configuration. The assessment dashboard will also fail if the SQL warehouse is not configured correctly. [[back to top](#external-hive-metastore-integration)] @@ -91,14 +91,14 @@ metastore is redundant and will be a no-op. # Additional Considerations -If a workspace is set up with multiple external Hive metastores, you will need to plan the approach carefully. Below are +If a workspace is set up with multiple external Hive metastores, you will need to plan the approach carefully. Below are a few considerations to keep in mind: - You can have multiple UCX installations in a workspace, each set up with a different external Hive metastore. As the SQL data access configuration is shared across the entire workspace, you will need to manually update them when running each UCX installation. - You can uninstall UCX and reinstall it with a different external Hive metastore. This still requires manual updates to the SQL data access configuration, but it is a cleaner approach. -- You can manually modify the cluster policy and SQL data access configuration to point to the correct external Hive +- You can manually modify the cluster policy and SQL data access configuration to point to the correct external Hive metastore, after UCX has been installed. This is the most flexible approach, but requires manual intervention. -[[back to top](#external-hive-metastore-integration)] \ No newline at end of file +[[back to top](#external-hive-metastore-integration)] diff --git a/docs/group_name_conflict.md b/docs/group_name_conflict.md index 07d022f7e8..d9223434ca 100644 --- a/docs/group_name_conflict.md +++ b/docs/group_name_conflict.md @@ -19,7 +19,7 @@ Choose how to map the workspace groups: [3] Match by External ID [4] Regex Substitution [5] Regex Matching -Enter a number between 0 and 5: +Enter a number between 0 and 5: ``` The user then input the Prefix/Suffix/Regular Expression. @@ -41,4 +41,4 @@ Group Translation Scenarios: | Prefix | prefix: [Prefix] | ^ | [Prefix] | [EMPTY] | data_engineers --> prod_data_engineers | | Suffix | suffix: [Prefix] | $ | [Suffix] | [EMPTY] | data_engineers --> data_engineers_prod | | Substitution | Search Regex: [Regex]
Replace Text:[Replacement_Text] | [WS_Regex] | [ [Replacement_Text] | [Empty] | corp_tech_data_engineers --> prod_data_engineers | -| Partial Lookup | Workspace Regex: [WS_Regex]
Account Regex: [Acct Regex] | [WS_Regex] | [Empty] | [Acct_Regex] | data_engineers(12345) --> data_engs(12345) | \ No newline at end of file +| Partial Lookup | Workspace Regex: [WS_Regex]
Account Regex: [Acct Regex] | [WS_Regex] | [Empty] | [Acct_Regex] | data_engineers(12345) --> data_engs(12345) | diff --git a/docs/local-group-migration.md b/docs/local-group-migration.md index 9323a604c9..74ea243049 100644 --- a/docs/local-group-migration.md +++ b/docs/local-group-migration.md @@ -15,16 +15,16 @@ Workspace Group Migration * [Troubleshooting](#troubleshooting) -This feature introduces the ability to migrate groups from workspace level to account level in +This feature introduces the ability to migrate groups from workspace level to account level in the [group migration workflow](../README.md#group-migration-workflow). It helps you to upgrade all Databricks workspace assets: -Legacy Table ACLs, Entitlements, AWS instance profiles, Clusters, Cluster policies, Instance Pools, +Legacy Table ACLs, Entitlements, AWS instance profiles, Clusters, Cluster policies, Instance Pools, Databricks SQL warehouses, Delta Live Tables, Jobs, MLflow experiments, MLflow registry, SQL Dashboards & Queries, SQL Alerts, Token and Password usage permissions that are set on the workspace level, Secret scopes, Notebooks, -Directories, Repos, and Files. +Directories, Repos, and Files. -It ensures that all the necessary groups are available in the workspace with the correct permissions, and removes any unnecessary groups and permissions. -The tasks in the group migration workflow depend on the output of the assessment workflow and can be executed in sequence to ensure a successful migration. -The output of each task is stored in Delta tables in the `$inventory_database` schema. +It ensures that all the necessary groups are available in the workspace with the correct permissions, and removes any unnecessary groups and permissions. +The tasks in the group migration workflow depend on the output of the assessment workflow and can be executed in sequence to ensure a successful migration. +The output of each task is stored in Delta tables in the `$inventory_database` schema. The group migration workflow can be executed multiple times to ensure that all the groups are migrated successfully and that all the necessary permissions are assigned. @@ -39,45 +39,45 @@ The group migration workflow can be executed multiple times to ensure that all t # Design -`MigratedGroup` class represents a group that has been migrated from one name to another and stores information about -the original and new names, as well as the group's members, external ID, and roles. The `MigrationState` class holds -the state of the migration process and provides methods for getting the target principal and temporary name for a given +`MigratedGroup` class represents a group that has been migrated from one name to another and stores information about +the original and new names, as well as the group's members, external ID, and roles. The `MigrationState` class holds +the state of the migration process and provides methods for getting the target principal and temporary name for a given group name. [[back to top](#workspace-group-migration)] ## Group Manager -The `GroupManager` class is a `CrawlerBase` subclass that manages groups in a Databricks workspace. It provides methods -for renaming groups, reflecting account groups on the workspace, deleting original workspace groups, and validating -group membership. The class also provides methods for listing workspace and account groups, getting group details, and +The `GroupManager` class is a `CrawlerBase` subclass that manages groups in a Databricks workspace. 
It provides methods +for renaming groups, reflecting account groups on the workspace, deleting original workspace groups, and validating +group membership. The class also provides methods for listing workspace and account groups, getting group details, and deleting groups. -The `GroupMigrationStrategy` abstract base class defines the interface for a strategy that generates a list -of `MigratedGroup` objects based on a mapping between workspace and account groups. -The `MatchingNamesStrategy`, `MatchByExternalIdStrategy`, `RegexSubStrategy`, and `RegexMatchStrategy` classes are +The `GroupMigrationStrategy` abstract base class defines the interface for a strategy that generates a list +of `MigratedGroup` objects based on a mapping between workspace and account groups. +The `MatchingNamesStrategy`, `MatchByExternalIdStrategy`, `RegexSubStrategy`, and `RegexMatchStrategy` classes are concrete implementations of this interface. See [group name conflicts](group_name_conflict.md) for more details. -The `ConfigureGroups` class provides a command-line interface for configuring the group migration process during [installation](../README.md#installation). -It prompts the user to enter information about the group migration strategy, such as the renamed group prefix, regular expressions -for matching and substitution, and a list of groups to migrate. The class also provides methods for validating user input +The `ConfigureGroups` class provides a command-line interface for configuring the group migration process during [installation](../README.md#installation). +It prompts the user to enter information about the group migration strategy, such as the renamed group prefix, regular expressions +for matching and substitution, and a list of groups to migrate. The class also provides methods for validating user input and setting class variables based on the user's responses. [[back to top](#workspace-group-migration)] ## Permission Manager -It enables to crawl, save, and apply permissions for [clusters](#generic-permissions), -[tables and UDFs (User-Defined Functions)](#legacy-table-access-controls), [secret scopes](#secret-scope-permissions), +It enables to crawl, save, and apply permissions for [clusters](#generic-permissions), +[tables and UDFs (User-Defined Functions)](#legacy-table-access-controls), [secret scopes](#secret-scope-permissions), [entitlements](#entitlements-and-roles), and [dashboards](#dashboard-permissions). -To use the module, you can create a `PermissionManager` instance by calling the `factory` method, which sets up -the necessary [`AclSupport` objects](#acl-support) for different types of objects in the workspace. Once the instance -is created, you can call the `inventorize_permissions` method to crawl and save the permissions for all objects to +To use the module, you can create a `PermissionManager` instance by calling the `factory` method, which sets up +the necessary [`AclSupport` objects](#acl-support) for different types of objects in the workspace. Once the instance +is created, you can call the `inventorize_permissions` method to crawl and save the permissions for all objects to the inventory database in the `permissions` table. -The `apply_group_permissions` method allows you to apply the permissions to a list of account groups, while -the [`verify_group_permissions` method](../README.md#validate-groups-membership-command) verifies that the permissions are valid. 
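The crawl-then-apply cycle described here can be pictured with the following self-contained sketch; the `InventorySketch` class, its method names, and the in-memory "inventory" are simplified assumptions standing in for the real `PermissionManager`, which persists crawled rows to the `permissions` table instead.

```python
from dataclasses import dataclass


@dataclass
class PermissionRecord:
    """One crawled permission: the object it belongs to and its raw payload."""
    object_type: str
    object_id: str
    raw: str


class InventorySketch:
    """Illustrates inventorize -> apply -> verify against an in-memory inventory."""

    def __init__(self) -> None:
        self._records: list[PermissionRecord] = []

    def inventorize_permissions(self, crawled: list[PermissionRecord]) -> None:
        # The real implementation writes these rows to $inventory_database.permissions.
        self._records = list(crawled)

    def apply_group_permissions(self, rename: dict[str, str]) -> list[str]:
        applied = []
        for record in self._records:
            for workspace_group, account_group in rename.items():
                if workspace_group in record.raw:
                    applied.append(
                        f"{record.object_type}/{record.object_id}: {workspace_group} -> {account_group}"
                    )
        return applied

    def verify_group_permissions(self, rename: dict[str, str]) -> bool:
        # Trivial check for the sketch: every crawled record was re-applied to an account group.
        return len(self.apply_group_permissions(rename)) == len(self._records)


inventory = InventorySketch()
inventory.inventorize_permissions(
    [PermissionRecord("clusters", "0123-abc", '{"group_name": "data_engineers", "level": "CAN_MANAGE"}')]
)
print(inventory.apply_group_permissions({"data_engineers": "data_engs"}))
assert inventory.verify_group_permissions({"data_engineers": "data_engs"})
```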
+The `apply_group_permissions` method allows you to apply the permissions to a list of account groups, while +the [`verify_group_permissions` method](../README.md#validate-groups-membership-command) verifies that the permissions are valid. [[back to top](#workspace-group-migration)] @@ -90,15 +90,15 @@ The `AclSupport` objects define how to crawl, save, and apply permissions for sp * `get_verify_task`: A method that returns a callable that verifies that the permissions for a given `Permissions` object are applied correctly to the destination group. This method can be used to ensure that permissions are applied as expected, helping to improve the reliability and security of your Databricks workspace. * `object_types`: An abstract method that returns a set of strings representing the object types that the `AclSupport` instance supports. This method should be implemented to provide the necessary information about the object types supported by the `AclSupport` class. -The `Permissions` dataclass is used to represent the permissions for a specific object type and ID. The dataclass includes a `raw` attribute +The `Permissions` dataclass is used to represent the permissions for a specific object type and ID. The dataclass includes a `raw` attribute that contains the raw permission data as a string, providing a convenient way to work with the underlying permission data. [[back to top](#workspace-group-migration)] ### Generic Permissions -The `GenericPermissionsSupport` class is a concrete implementation of the [`AclSupport` interface](#acl-support) for -migrating permissions on various objects in a Databricks workspace. It is designed to be flexible and support almost any +The `GenericPermissionsSupport` class is a concrete implementation of the [`AclSupport` interface](#acl-support) for +migrating permissions on various objects in a Databricks workspace. It is designed to be flexible and support almost any object type in the workspace: - clusters @@ -116,18 +116,18 @@ object type in the workspace: - notebooks - workspace folders -It takes in an instance of the `WorkspaceClient` class, a list of `Listing` objects, and a `verify_timeout` parameter in -its constructor. The `Listing` objects are responsible for listing the objects in the workspace, and +It takes in an instance of the `WorkspaceClient` class, a list of `Listing` objects, and a `verify_timeout` parameter in +its constructor. The `Listing` objects are responsible for listing the objects in the workspace, and the `GenericPermissionsSupport` class uses these listings to crawl the ACL permissions for each object. -The `_apply_grant` method applies the ACL permission to the target principal in the database, and the `_verify` method -checks if the ACL permission in the `Grant` object matches the ACL permission for that object and principal in the database. -If the ACL permission does not match, the method raises a `ValueError` with an error message. The `get_verify_task` method -takes in a `Permissions` object and returns a callable object that calls the `_verify` method with the object type, +The `_apply_grant` method applies the ACL permission to the target principal in the database, and the `_verify` method +checks if the ACL permission in the `Grant` object matches the ACL permission for that object and principal in the database. +If the ACL permission does not match, the method raises a `ValueError` with an error message. 
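A minimal sketch of the `Permissions` payload and of the verify-task pattern follows; the `applied_acls` lookup is a hypothetical stand-in for the workspace API that `GenericPermissionsSupport` actually queries, and the class and function names are illustrative only.

```python
import json
from dataclasses import dataclass
from typing import Callable


@dataclass
class PermissionsSketch:
    """Mirrors the shape described above: object type, object id, and raw permission payload."""
    object_type: str
    object_id: str
    raw: str


def make_verify_task(item: PermissionsSketch, applied_acls: dict[str, dict]) -> Callable[[], bool]:
    """Build a callable that compares the stored ACL with what is currently applied."""

    def verify() -> bool:
        expected = json.loads(item.raw)
        actual = applied_acls.get(item.object_id)
        if actual != expected:
            raise ValueError(
                f"ACL mismatch on {item.object_type}/{item.object_id}: expected {expected}, got {actual}"
            )
        return True

    return verify


task = make_verify_task(
    PermissionsSketch("clusters", "0123-abc", '{"data_engs": "CAN_MANAGE"}'),
    applied_acls={"0123-abc": {"data_engs": "CAN_MANAGE"}},
)
assert task() is True
```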
The `get_verify_task` method +takes in a `Permissions` object and returns a callable object that calls the `_verify` method with the object type, object ID, and `Grant` object from the `Permissions` object. -he `_safe_get_permissions` and `_safe_updatepermissions` methods are used to safely get and update the permissions for -a given object type and ID, respectively. These methods handle exceptions that may occur during the API calls and log +he `_safe_get_permissions` and `_safe_updatepermissions` methods are used to safely get and update the permissions for +a given object type and ID, respectively. These methods handle exceptions that may occur during the API calls and log appropriate warning messages. [[back to top](#workspace-group-migration)] @@ -140,8 +140,8 @@ Reflected in [RedashPermissionsSupport](../src/databricks/labs/ucx/workspace_acc ### Entitlements and Roles -The `ScimSupport` is [`AclSupport`](#acl-support) that creates a snapshot of all the groups in the workspace, including their display name, id, meta, roles, and entitlements. -The `_is_item_relevant` method checks if a permission item is relevant to the current migration state. The `get_crawler_tasks` method returns an iterator of partial functions +The `ScimSupport` is [`AclSupport`](#acl-support) that creates a snapshot of all the groups in the workspace, including their display name, id, meta, roles, and entitlements. +The `_is_item_relevant` method checks if a permission item is relevant to the current migration state. The `get_crawler_tasks` method returns an iterator of partial functions for crawling the permissions of each group in the snapshot. It checks if the group has any roles or entitlements and returns a partial function to crawl the corresponding property. See [examples](../tests/integration/workspace_access/test_scim.py) for more details on how to use it as a library. @@ -150,9 +150,9 @@ See [examples](../tests/integration/workspace_access/test_scim.py) for more deta ### Secret Scope Permissions -`SecretScopesSupport` is a concrete implementation of the [`AclSupport` interface](#acl-support) for crawling ACLs of -all secret scopes, applying and verifying ACLs, and checking if a `Permissions` object is relevant to the current -migration state. It simplifies the process of managing permissions on secret scopes by checking if the ACLs have been +`SecretScopesSupport` is a concrete implementation of the [`AclSupport` interface](#acl-support) for crawling ACLs of +all secret scopes, applying and verifying ACLs, and checking if a `Permissions` object is relevant to the current +migration state. It simplifies the process of managing permissions on secret scopes by checking if the ACLs have been applied correctly, and if not, automatically reapplying them. [[back to top](#workspace-group-migration)] @@ -160,11 +160,11 @@ applied correctly, and if not, automatically reapplying them. ### Legacy Table Access Controls The `TableAclSupport` class is initialized with an instance of `GrantsCrawler` and `SqlBackend` classes, along with a `verify_timeout` parameter. -The class offers methods for crawling table ACL permissions, applying and verifying ACL permissions, and checking if a `Permissions` object is relevant to the current migration state. -The `get_crawler_tasks` method returns an iterator of callable objects, each of which returns a `Permissions` object for a specific table ACL permission. 
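The "iterator of callables" crawl pattern used by both `ScimSupport` and `TableAclSupport` can be sketched as below; the group snapshot shape and the `functools.partial` wiring are illustrative assumptions rather than the exact implementation.

```python
import json
from functools import partial
from typing import Callable, Iterator


def _crawl_property(group_id: str, prop: str, values: list[str]) -> str:
    # One lazy crawl task: serialize a single property (roles or entitlements) of one group.
    return json.dumps({"object_type": "GROUP", "object_id": group_id, prop: values})


def get_crawler_tasks(groups: list[dict]) -> Iterator[Callable[[], str]]:
    """Yield one task per group property that actually has values, so crawling stays lazy."""
    for group in groups:
        for prop in ("roles", "entitlements"):
            values = group.get(prop) or []
            if values:
                yield partial(_crawl_property, group["id"], prop, values)


snapshot = [
    {"id": "123", "roles": ["arn:aws:iam::0:instance-profile/demo"], "entitlements": []},
    {"id": "456", "roles": [], "entitlements": ["allow-cluster-create"]},
]
for task in get_crawler_tasks(snapshot):
    print(task())
```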
+The class offers methods for crawling table ACL permissions, applying and verifying ACL permissions, and checking if a `Permissions` object is relevant to the current migration state. +The `get_crawler_tasks` method returns an iterator of callable objects, each of which returns a `Permissions` object for a specific table ACL permission. The `_from_reduced` method creates a `Grant` object for each set of folded actions, and the `get_apply_task` method applies the ACL permission in the `Permissions` object to the target principal in the `MigrationState` object. -Furthermore, the `_apply_grant` method applies the ACL permission to the target principal in the database, while the `_verify` method checks if the ACL permission in -the `Grant` object matches the ACL permission for that object and principal in the database. The `get_verify_task` method calls the `_verify` method with the object type, +Furthermore, the `_apply_grant` method applies the ACL permission to the target principal in the database, while the `_verify` method checks if the ACL permission in +the `Grant` object matches the ACL permission for that object and principal in the database. The `get_verify_task` method calls the `_verify` method with the object type, object ID, and `Grant` object from the `Permissions` object. [[back to top](#workspace-group-migration)] @@ -291,4 +291,4 @@ finally: file_handler.close() ``` -[[back to top](#workspace-group-migration)] \ No newline at end of file +[[back to top](#workspace-group-migration)] diff --git a/docs/table_upgrade.md b/docs/table_upgrade.md index 0e9a2279a7..315f1e144d 100644 --- a/docs/table_upgrade.md +++ b/docs/table_upgrade.md @@ -162,16 +162,16 @@ upgrade. This feedback is presented in the migration dashboard: ## Data Access Permissions -The code provided is a Python module that defines a `Grant` dataclass and a `GrantsCrawler` class. The `Grant` dataclass -represents a grant of privileges in a database system, with attributes for the principal, action type, catalog, database, -table, view, UDF, and flags for any file and anonymous function. The `GrantsCrawler` class is a crawler that fetches grants -for databases, tables, views, UDFs, and anonymous functions in a Hive metastore. - -It uses a `TablesCrawler` and `UdfsCrawler` to fetch table and UDF information, respectively. The `GrantsCrawler` class -provides methods for fetching grants based on different parameters and returning them as an iterable of `Grant` objects. -It also provides methods for getting grants for a specific table or schema. The code includes a `_type_and_key` method -that normalizes the input parameters and returns a tuple of the object type and key, which is used to fetch grants for -the specified object. The code also includes methods for generating SQL statements to grant and revoke privileges in +The code provided is a Python module that defines a `Grant` dataclass and a `GrantsCrawler` class. The `Grant` dataclass +represents a grant of privileges in a database system, with attributes for the principal, action type, catalog, database, +table, view, UDF, and flags for any file and anonymous function. The `GrantsCrawler` class is a crawler that fetches grants +for databases, tables, views, UDFs, and anonymous functions in a Hive metastore. + +It uses a `TablesCrawler` and `UdfsCrawler` to fetch table and UDF information, respectively. The `GrantsCrawler` class +provides methods for fetching grants based on different parameters and returning them as an iterable of `Grant` objects. 
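The `Grant` shape and the statement generation it supports can be sketched as follows; the attribute names and the exact GRANT/REVOKE strings are simplified examples in the legacy table-ACL style, not the crawler's real output.

```python
from dataclasses import dataclass


@dataclass
class GrantSketch:
    """Simplified grant: who gets which action on which object."""
    principal: str
    action_type: str          # e.g. "SELECT", "MODIFY"
    catalog: str | None = None
    database: str | None = None
    table: str | None = None

    def _object(self) -> str:
        parts = [p for p in (self.catalog, self.database, self.table) if p]
        return ".".join(parts)

    def hive_grant_sql(self) -> str:
        # Legacy table ACL style statement (no catalog prefix in the Hive Metastore).
        return f"GRANT {self.action_type} ON TABLE {self._object()} TO `{self.principal}`"

    def hive_revoke_sql(self) -> str:
        return f"REVOKE {self.action_type} ON TABLE {self._object()} FROM `{self.principal}`"


grant = GrantSketch(principal="data_engs", action_type="SELECT", database="sales", table="orders")
print(grant.hive_grant_sql())   # GRANT SELECT ON TABLE sales.orders TO `data_engs`
print(grant.hive_revoke_sql())  # REVOKE SELECT ON TABLE sales.orders FROM `data_engs`
```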
+It also provides methods for getting grants for a specific table or schema. The code includes a `_type_and_key` method +that normalizes the input parameters and returns a tuple of the object type and key, which is used to fetch grants for +the specified object. The code also includes methods for generating SQL statements to grant and revoke privileges in Hive and Unity Catalog (UC) systems. [[back to top](#table-upgrade)] @@ -180,86 +180,86 @@ Hive and Unity Catalog (UC) systems. The module includes two classes, `ExternalLocations` and `Mounts`, which inherit from `CrawlerBase`. -`ExternalLocations` is a class for crawling and managing external locations used by tables in a Databricks workspace. -It has methods for creating a list of external locations based on tables in a given schema and a method for generating +`ExternalLocations` is a class for crawling and managing external locations used by tables in a Databricks workspace. +It has methods for creating a list of external locations based on tables in a given schema and a method for generating Terraform definitions for any missing external locations. The class has a `_external_locations` method that filters and p rocesses the external locations based on certain conditions. -`Mounts` is a class for managing mounts in a Databricks workspace. It has methods for listing and deduplicating mounts, -as well as a method for creating a snapshot of the current mounts. The `_deduplicate_mounts` method removes any duplicate +`Mounts` is a class for managing mounts in a Databricks workspace. It has methods for listing and deduplicating mounts, +as well as a method for creating a snapshot of the current mounts. The `_deduplicate_mounts` method removes any duplicate mounts based on their name and source. [[back to top](#table-upgrade)] ## Table Mapping -The module includes two dataclasses, `Rule` and `TableToMigrate`, which encapsulate information about the source and target tables for migration. -The `Rule` dataclass includes information about the source and target catalog, schema, and table names, as well as a method for generating -the unique key for the target table in the Unity Catalog (UC) and the Hive Metastore (HMS). The `TableToMigrate` dataclass includes +The module includes two dataclasses, `Rule` and `TableToMigrate`, which encapsulate information about the source and target tables for migration. +The `Rule` dataclass includes information about the source and target catalog, schema, and table names, as well as a method for generating +the unique key for the target table in the Unity Catalog (UC) and the Hive Metastore (HMS). The `TableToMigrate` dataclass includes a `Table` object representing the source table and a `Rule` object representing the migration rule for that table. -At the heart of the module is the `TableMapping` class, which is the main class for managing table mappings. -The `TableMapping` class includes several methods for managing the table mappings, such as loading and saving -the mappings to a file, skipping tables and schemas, and checking if a table is already migrated or marked to be skipped. -The `TableMapping` class is initialized with an `Installation` object, a `WorkspaceClient` object, and a `SqlBackend` object, +At the heart of the module is the `TableMapping` class, which is the main class for managing table mappings. 
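A minimal sketch of the `Rule` idea follows; the field names and the `catalog.schema.table` key format are assumptions chosen for illustration rather than the exact dataclass definition.

```python
from dataclasses import dataclass


@dataclass
class RuleSketch:
    """Maps a Hive Metastore table to its target location in Unity Catalog."""
    workspace_name: str
    catalog_name: str      # target UC catalog
    src_schema: str
    dst_schema: str
    src_table: str
    dst_table: str

    def as_uc_table_key(self) -> str:
        # Unique key of the target table in Unity Catalog.
        return f"{self.catalog_name}.{self.dst_schema}.{self.dst_table}"

    def as_hms_table_key(self) -> str:
        # Unique key of the source table in the Hive Metastore.
        return f"hive_metastore.{self.src_schema}.{self.src_table}"


rule = RuleSketch("my-workspace", "my_workspace", "sales", "sales", "orders", "orders")
assert rule.as_uc_table_key() == "my_workspace.sales.orders"
assert rule.as_hms_table_key() == "hive_metastore.sales.orders"
```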
+The `TableMapping` class includes several methods for managing the table mappings, such as loading and saving +the mappings to a file, skipping tables and schemas, and checking if a table is already migrated or marked to be skipped. +The `TableMapping` class is initialized with an `Installation` object, a `WorkspaceClient` object, and a `SqlBackend` object, which are used to interact with the Unity Catalog, the workspace, and to execute SQL queries. [[back to top](#table-upgrade)] ## Migrating Tables -The `TablesMigrate` class is designed for migrating tables from one schema to another within a Databricks workspace. -This class requires instances of `TablesCrawler`, `WorkspaceClient`, `SqlBackend`, and `TableMapping` as inputs. -The `migrate_tables` method is responsible for migrating tables and takes an optional argument `what` to filter tables -based on their type. This method internally calls the `_migrate_table` method which is responsible for migrating +The `TablesMigrate` class is designed for migrating tables from one schema to another within a Databricks workspace. +This class requires instances of `TablesCrawler`, `WorkspaceClient`, `SqlBackend`, and `TableMapping` as inputs. +The `migrate_tables` method is responsible for migrating tables and takes an optional argument `what` to filter tables +based on their type. This method internally calls the `_migrate_table` method which is responsible for migrating the actual table and determining the appropriate migration method based on the table's type. -The `_migrate_external_table`, `_migrate_dbfs_root_table`, and `_migrate_view` methods are used to migrate external -tables, DBFS root tables, and views, respectively. The `_init_seen_tables`, `_table_already_upgraded`, `_get_tables_to_revert`, -and `_revert_migrated_table` methods are used for managing the state of the migration process. The `_init_seen_tables` method -initializes the list of tables that have been seen during the migration process. The `_table_already_upgraded` method checks +The `_migrate_external_table`, `_migrate_dbfs_root_table`, and `_migrate_view` methods are used to migrate external +tables, DBFS root tables, and views, respectively. The `_init_seen_tables`, `_table_already_upgraded`, `_get_tables_to_revert`, +and `_revert_migrated_table` methods are used for managing the state of the migration process. The `_init_seen_tables` method +initializes the list of tables that have been seen during the migration process. The `_table_already_upgraded` method checks if a table has already been upgraded. The `_get_tables_to_revert` method retrieves the list of tables that can be reverted. The `_revert_migrated_table` method is responsible for reverting the migration of a table. -The `is_upgraded` method checks if a table has been upgraded or not. The `print_revert_report` method generates a report +The `is_upgraded` method checks if a table has been upgraded or not. The `print_revert_report` method generates a report of the tables that can be reverted. [[back to top](#table-upgrade)] ## Moving tables -The `TableMove` class is a newly developed feature that enables the movement or aliasing of tables and views from one -schema to another within UC. This class requires an instance of `WorkspaceClient` and `SqlBackend` as inputs and provides -two primary methods: `move_tables` and `alias_tables`. 
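To illustrate the per-type dispatch performed by `_migrate_table`, here is a self-contained sketch; the `What` enum values, the handler strings, and the schema placeholders are simplified assumptions based on the description above, not the real migration code.

```python
from enum import Enum, auto


class What(Enum):
    EXTERNAL_SYNC = auto()
    DBFS_ROOT_DELTA = auto()
    VIEW = auto()


def migrate_table(table_name: str, what: What) -> str:
    """Pick a migration strategy per table type, as the description above outlines."""
    if what is What.EXTERNAL_SYNC:
        # External tables can typically be upgraded in place with SYNC.
        return f"SYNC TABLE target_catalog.target_schema.{table_name} FROM hive_metastore.src_schema.{table_name}"
    if what is What.DBFS_ROOT_DELTA:
        # DBFS-root tables need their data copied into a managed UC table.
        return f"CREATE TABLE target_catalog.target_schema.{table_name} DEEP CLONE hive_metastore.src_schema.{table_name}"
    if what is What.VIEW:
        # Views are recreated from their view text against the already migrated tables.
        return f"CREATE VIEW target_catalog.target_schema.{table_name} AS <original view text>"
    raise ValueError(f"unsupported table type: {what}")


def migrate_tables(tables: dict[str, What], what: What | None = None) -> list[str]:
    """Optionally filter by type, mirroring the `what` argument described above."""
    return [migrate_table(name, kind) for name, kind in tables.items() if what is None or kind is what]


print(migrate_tables({"orders": What.EXTERNAL_SYNC, "daily_kpis": What.VIEW}, what=What.EXTERNAL_SYNC))
```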
The `move_tables` method moves tables to a new schema, while +The `TableMove` class is a newly developed feature that enables the movement or aliasing of tables and views from one +schema to another within UC. This class requires an instance of `WorkspaceClient` and `SqlBackend` as inputs and provides +two primary methods: `move_tables` and `alias_tables`. The `move_tables` method moves tables to a new schema, while the `alias_tables` method creates aliases of tables and views in a different schema. The `_move_table`, `_alias_table`, and `_move_view` methods are responsible for performing the actual movement, aliasing, -and recreating of the table or view in the destination schema, taking into account any dependencies or permissions -associated with the object. The `_reapply_grants` method reapplies the grants on the migrated table or view, ensuring -that the necessary permissions are maintained. The `_recreate_table` and `_recreate_view` methods recreate the table or +and recreating of the table or view in the destination schema, taking into account any dependencies or permissions +associated with the object. The `_reapply_grants` method reapplies the grants on the migrated table or view, ensuring +that the necessary permissions are maintained. The `_recreate_table` and `_recreate_view` methods recreate the table or view in the destination schema, including any dependencies or permissions associated with the object. [[back to top](#table-upgrade)] ## Table Size Estimation -The Table Size Crawler is a new feature of the data crawler system that calculates the size of tables in a Hive Metastore. -The `TableSizeCrawler` class is developed to inherit from `CrawlerBase` and is initialized with a SQL Execution Backend -and a schema name. This class uses the `TablesCrawler` class to obtain a snapshot of tables and then iterates over them +The Table Size Crawler is a new feature of the data crawler system that calculates the size of tables in a Hive Metastore. +The `TableSizeCrawler` class is developed to inherit from `CrawlerBase` and is initialized with a SQL Execution Backend +and a schema name. This class uses the `TablesCrawler` class to obtain a snapshot of tables and then iterates over them to calculate the size of each table using the `_safe_get_table_size` method which queries the Spark SQL engine. -The `TableSizeCrawler` class has several methods, including `snapshot`, `_try_load`, and `_crawl`. The `snapshot` method +The `TableSizeCrawler` class has several methods, including `snapshot`, `_try_load`, and `_crawl`. The `snapshot` method returns a list of `TableSize` objects representing the snapshot of tables, filtered to include only those with a non-null -size. The `_try_load` method tries to load table information from the database and raises a `TABLE_OR_VIEW_NOT_FOUND` -error if the table cannot be found. The `_crawl` method crawls and lists tables using the `tables_crawler` object and +size. The `_try_load` method tries to load table information from the database and raises a `TABLE_OR_VIEW_NOT_FOUND` +error if the table cannot be found. The `_crawl` method crawls and lists tables using the `tables_crawler` object and calculates the size of DBFS root tables, skipping tables that are not of type `TABLE` or are not DBFS root tables. [[back to top](#table-upgrade)] ## Table Crawler -The `TablesCrawler` is designed for crawling and listing tables within Hive Metastore. 
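The size-crawling behaviour described above (query each DBFS-root table, skip whatever cannot be measured) can be sketched without a live Spark session as follows; `fetch_size_in_bytes` is a hypothetical callable standing in for the Spark SQL query that the real crawler issues.

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class TableSizeSketch:
    database: str
    name: str
    size_in_bytes: int


def safe_get_table_size(full_name: str, fetch_size_in_bytes: Callable[[str], int]) -> int | None:
    """Return the table size, or None when the table has disappeared or cannot be measured."""
    try:
        return fetch_size_in_bytes(full_name)
    except Exception as err:  # the real code matches the TABLE_OR_VIEW_NOT_FOUND error specifically
        if "TABLE_OR_VIEW_NOT_FOUND" in str(err):
            return None
        raise


def crawl_sizes(tables: list[tuple[str, str]], fetch: Callable[[str], int]) -> list[TableSizeSketch]:
    snapshot = []
    for database, name in tables:
        size = safe_get_table_size(f"{database}.{name}", fetch)
        if size is not None:  # only keep tables whose size could be computed
            snapshot.append(TableSizeSketch(database, name, size))
    return snapshot


def fake_fetch(full_name: str) -> int:
    if full_name == "sales.orders":
        return 1024
    raise RuntimeError("TABLE_OR_VIEW_NOT_FOUND")


print(crawl_sizes([("sales", "orders"), ("sales", "dropped_table")], fake_fetch))
```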
It can fetch detailed information -about each table, including the table's name, external location, and storage format. This information can be used to +The `TablesCrawler` is designed for crawling and listing tables within Hive Metastore. It can fetch detailed information +about each table, including the table's name, external location, and storage format. This information can be used to better understand the structure and contents of the tables in the Databricks workspace. [[back to top](#table-upgrade)] diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 9115543e2b..db85db5b0c 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -97,7 +97,7 @@ logs ``` ### Reading log files -Open the downloaded log files in a text editor or viewer. +Open the downloaded log files in a text editor or viewer. VSCode is an excellent example as it will allow you to search the entire folder for `ERROR` @@ -132,7 +132,7 @@ If Azure CLI has already been installed and authenticated, but you see the follo `14:50:33 ERROR [d.labs.ucx] In order to obtain AAD token, Please run azure cli to authenticate.` -Resolve this in macOS by running the command with an explicit auth type set: `DATABRICKS_AUTH_TYPE=azure-cli databricks labs ucx ...`. +Resolve this in macOS by running the command with an explicit auth type set: `DATABRICKS_AUTH_TYPE=azure-cli databricks labs ucx ...`. To resolve this issue in Windows, proceed with the following steps: 1. Open `%userprofile%` (the path like `C:\Users\`) @@ -143,7 +143,7 @@ To resolve this issue in Windows, proceed with the following steps: ### Resolving common errors on UCX install #### Error on installing the ucx inventory database -Your platform administrators may have implemented policies in one manner or another to prevent arbitrary database creation. +Your platform administrators may have implemented policies in one manner or another to prevent arbitrary database creation. - You may be prohibited from creating a database with a default location to `dbfs:/`. - You may be required to create a database on an external Hive Metastore (HMS) and need compute configured to do so. @@ -169,4 +169,4 @@ See the gathering log information sections elsewhere in this document. ### Resolving other common errors - If you have an external Hive Metastore (HMS) such as Glue Catalog or a MySQL, Postgres or SQL server database, please consult the [External Hive Metastore Integration guide](external_hms_glue.md) - If you are running table upgrade commands and workflows. 
Please consult the [Table Upgrade guide](table_upgrade.md) -- If you are trying to understand the Assessment report, please consult the [Assessment documentation](assessment.md) \ No newline at end of file +- If you are trying to understand the Assessment report, please consult the [Assessment documentation](assessment.md) diff --git a/src/databricks/labs/ucx/mixins/README.md b/src/databricks/labs/ucx/mixins/README.md index 2110e37f75..37fc37912c 100644 --- a/src/databricks/labs/ucx/mixins/README.md +++ b/src/databricks/labs/ucx/mixins/README.md @@ -1 +1 @@ -This file contains code that can potentially get backported to Databricks SDK for Python \ No newline at end of file +This file contains code that can potentially get backported to Databricks SDK for Python diff --git a/src/databricks/labs/ucx/queries/assessment/azure/05_0_azure_service_principals.sql b/src/databricks/labs/ucx/queries/assessment/azure/05_0_azure_service_principals.sql index 5f6970ea13..9d258c5813 100644 --- a/src/databricks/labs/ucx/queries/assessment/azure/05_0_azure_service_principals.sql +++ b/src/databricks/labs/ucx/queries/assessment/azure/05_0_azure_service_principals.sql @@ -7,4 +7,4 @@ select if(tenant_id = '', "NA", tenant_id) tenant_id, if(storage_account = '', "NA", storage_account) storage_account from - $inventory.azure_service_principals \ No newline at end of file + $inventory.azure_service_principals diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/00_0_metastore_assignment.md b/src/databricks/labs/ucx/queries/assessment/estimates/00_0_metastore_assignment.md index 38bd7aa7e0..609f651a1c 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/00_0_metastore_assignment.md +++ b/src/databricks/labs/ucx/queries/assessment/estimates/00_0_metastore_assignment.md @@ -8,8 +8,8 @@ This section assumes that your workspace has been attached to a UC metastore, it If you haven't created a metastore yet, follow the docs below to attach your workspace to the metastore: -[[AWS]](https://docs.databricks.com/en/data-governance/unity-catalog/enable-workspaces.html) -[[Azure]](https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/enable-workspaces) +[[AWS]](https://docs.databricks.com/en/data-governance/unity-catalog/enable-workspaces.html) +[[Azure]](https://learn.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/enable-workspaces) [[GCP]](https://docs.gcp.databricks.com/data-governance/unity-catalog/enable-workspaces.html) If any incompatible submit runs has been detected, follow the steps highlighted below: diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/00_4_is_incompatible_submit_run_detected.sql b/src/databricks/labs/ucx/queries/assessment/estimates/00_4_is_incompatible_submit_run_detected.sql index 4ac37ca560..2101c967f1 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/00_4_is_incompatible_submit_run_detected.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/00_4_is_incompatible_submit_run_detected.sql @@ -2,7 +2,7 @@ -- widget title=Incompatible submit runs, row=0, col=2, size_x=4, size_y=8 SELECT * FROM (SELECT object_type, object_id, EXPLODE(from_json(failures, 'array')) AS finding -FROM $inventory.objects) -WHERE finding = "no data security mode specified" +FROM $inventory.objects) +WHERE finding = "no data security mode specified" AND object_type = "submit_runs" diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/01_0_group_migration.md 
b/src/databricks/labs/ucx/queries/assessment/estimates/01_0_group_migration.md index 5bfa92ffb6..6dc4ab31fe 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/01_0_group_migration.md +++ b/src/databricks/labs/ucx/queries/assessment/estimates/01_0_group_migration.md @@ -2,7 +2,7 @@ ## 2 - Group migration -The second step of succesfully adopting UC if migrating your workspace local groups to the account. +The second step of succesfully adopting UC if migrating your workspace local groups to the account. This step is a relatively low risk as it's an additive operation, it won't disturb your currently running pipelines. Follow those steps in order to successfully migrate your groups to the account: @@ -18,4 +18,4 @@ If you're using an Identity Provider: 3. Trigger a sync from your IdP to the account 1. To validate that all groups are properly setup for the group migration, run [validate-group-membership](https://github.com/databrickslabs/ucx/blob/main/README.md#validate-groups-membership-command) -Once the account groups are setup, perform the group migration by using the Group migration workflow, more information in the [docs](https://github.com/databrickslabs/ucx/blob/main/README.md#group-migration-workflow) \ No newline at end of file +Once the account groups are setup, perform the group migration by using the Group migration workflow, more information in the [docs](https://github.com/databrickslabs/ucx/blob/main/README.md#group-migration-workflow) diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/01_2_group_migration.sql b/src/databricks/labs/ucx/queries/assessment/estimates/01_2_group_migration.sql index db32768594..1adb2d549a 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/01_2_group_migration.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/01_2_group_migration.sql @@ -1,3 +1,3 @@ -- viz type=table, name=Workspace local groups, columns=id_in_workspace,name_in_workspace,name_in_account,temporary_name,members,entitlements,external_id,roles -- widget title=Workspace local groups to migrate, row=1, col=2, size_x=3, size_y=8 -SELECT * FROM $inventory.groups \ No newline at end of file +SELECT * FROM $inventory.groups diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/02_0_data_modeling.md b/src/databricks/labs/ucx/queries/assessment/estimates/02_0_data_modeling.md index 11f5250a6b..e0b8dedb83 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/02_0_data_modeling.md +++ b/src/databricks/labs/ucx/queries/assessment/estimates/02_0_data_modeling.md @@ -1,10 +1,10 @@ -- widget title=Table estimates, row=2, col=0, size_x=2, size_y=8 ## 3 - UC Data modeling -The third step of a successful UC migration is defining your target data model on UC. +The third step of a successful UC migration is defining your target data model on UC. This step is required in order to choose in which catalogs the existing data in Hive Metastore will land. -As a starting point, consider creating a catalog that has the same name as your workspace. +As a starting point, consider creating a catalog that has the same name as your workspace. For example, a table `database.table1` will land in the `workspace_name.database.table1` table. The complexity factor is relative to the number of databases and tables identified during the assessment. 
diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/02_2_uc_data_modeling.sql b/src/databricks/labs/ucx/queries/assessment/estimates/02_2_uc_data_modeling.sql index 63ba1aa20c..a8a4b76788 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/02_2_uc_data_modeling.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/02_2_uc_data_modeling.sql @@ -1,3 +1,3 @@ -- viz type=table, name=Tables to migrate, columns=catalog,database,name,object_type,table_format,location,view_text,upgraded_to,storage_properties -- widget title=Tables to migrate, row=2, col=2, size_x=3, size_y=8 -select * from $inventory.tables; \ No newline at end of file +select * from $inventory.tables; diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/02_5_uc_data_modeling_complexity.sql b/src/databricks/labs/ucx/queries/assessment/estimates/02_5_uc_data_modeling_complexity.sql index 13b12530f6..14fbae8be3 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/02_5_uc_data_modeling_complexity.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/02_5_uc_data_modeling_complexity.sql @@ -6,4 +6,4 @@ when distinct_tables between 1 and 100 then "S" when distinct_tables between 101 and 300 then "M" when distinct_tables > 301 then "L" else NULL end as uc_model_complexity from -(select count(distinct concat(database,".",name)) as distinct_tables from $inventory.tables); \ No newline at end of file +(select count(distinct concat(database,".",name)) as distinct_tables from $inventory.tables); diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/03_0_data_migration.md b/src/databricks/labs/ucx/queries/assessment/estimates/03_0_data_migration.md index dfe9869eb7..9565ed94d3 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/03_0_data_migration.md +++ b/src/databricks/labs/ucx/queries/assessment/estimates/03_0_data_migration.md @@ -1,8 +1,8 @@ -- widget title=Table estimates, row=3, col=0, size_x=2, size_y=8 ## 4 - Data migration to UC -Once you have defined your data model in UC and that you've created appropriate Storage Credentials and External Locations, -you can then migrate your data to UC +Once you have defined your data model in UC and that you've created appropriate Storage Credentials and External Locations, +you can then migrate your data to UC Assumptions for a single table migration estimates: diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/03_2_data_migration_summary.sql b/src/databricks/labs/ucx/queries/assessment/estimates/03_2_data_migration_summary.sql index b0c1c886a9..0c4b1d012d 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/03_2_data_migration_summary.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/03_2_data_migration_summary.sql @@ -1,3 +1,3 @@ -- viz type=table, name=Table estimates, columns=table_name,object_type,table_format,estimated_hours -- widget title=Table estimates, row=3, col=2, size_x=3, size_y=8 -SELECT * FROM $inventory.table_estimates \ No newline at end of file +SELECT * FROM $inventory.table_estimates diff --git a/src/databricks/labs/ucx/queries/assessment/estimates/03_5_data_migration_complexity.sql b/src/databricks/labs/ucx/queries/assessment/estimates/03_5_data_migration_complexity.sql index fc2f43bd1c..a2c46de2ce 100644 --- a/src/databricks/labs/ucx/queries/assessment/estimates/03_5_data_migration_complexity.sql +++ b/src/databricks/labs/ucx/queries/assessment/estimates/03_5_data_migration_complexity.sql @@ -8,4 +8,4 @@ CASE WHEN 
total_estimated_hours < 30 THEN "S" ELSE NULL END as data_migration_complexity FROM (SELECT sum(estimated_hours) AS total_estimated_hours -FROM $inventory.table_estimates) \ No newline at end of file +FROM $inventory.table_estimates) diff --git a/src/databricks/labs/ucx/queries/assessment/interactive/00_0_interactive.md b/src/databricks/labs/ucx/queries/assessment/interactive/00_0_interactive.md index dd4422372a..7b784805fc 100644 --- a/src/databricks/labs/ucx/queries/assessment/interactive/00_0_interactive.md +++ b/src/databricks/labs/ucx/queries/assessment/interactive/00_0_interactive.md @@ -16,4 +16,4 @@ To use this report: ### Compute Access Mode Limitation Summary -This widget will display a summary of the findings, the # workspaces, notebooks, clusters and users potentially impacted by [compute access mode limitations](https://docs.databricks.com/en/compute/access-mode-limitations.html#compute-access-mode-limitations) \ No newline at end of file +This widget will display a summary of the findings, the # workspaces, notebooks, clusters and users potentially impacted by [compute access mode limitations](https://docs.databricks.com/en/compute/access-mode-limitations.html#compute-access-mode-limitations) diff --git a/src/databricks/labs/ucx/queries/assessment/interactive/01_0_compute_access_mode_limitation_summary.sql b/src/databricks/labs/ucx/queries/assessment/interactive/01_0_compute_access_mode_limitation_summary.sql index 7a671bc0eb..0f5235f906 100644 --- a/src/databricks/labs/ucx/queries/assessment/interactive/01_0_compute_access_mode_limitation_summary.sql +++ b/src/databricks/labs/ucx/queries/assessment/interactive/01_0_compute_access_mode_limitation_summary.sql @@ -2,10 +2,10 @@ -- widget title=Compute Access Mode Limitation Summary, row=1, col=0, size_x=6, size_y=12 -- Scan notebook command history for potential paper cut issues -- https://docs.databricks.com/en/compute/access-mode-limitations.html#compute-access-mode-limitations --- -WITH +-- +WITH iteractive_cluster_commands ( - SELECT + SELECT a.event_id, a.request_params.notebookId AS notebook_id, a.request_params.clusterId AS cluster_id, @@ -18,7 +18,7 @@ iteractive_cluster_commands ( a.request_params.commandText, md5(a.request_params.commandText) commandHash FROM system.access.audit a - LEFT OUTER JOIN $inventory.clusters AS c + LEFT OUTER JOIN $inventory.clusters AS c ON a.request_params.clusterId = c.cluster_id AND a.action_name = 'runCommand' WHERE a.event_date >= DATE_SUB(CURRENT_DATE(), 90) @@ -31,23 +31,23 @@ pattern_matcher( array_except(array(p.issue, lp.issue, rv.issue,dbr_type.issue), array(null)) issues, a.* FROM iteractive_cluster_commands a - LEFT OUTER JOIN $inventory.code_patterns p + LEFT OUTER JOIN $inventory.code_patterns p ON a.commandLanguage in ('python','scala') AND contains(a.commandText, p.pattern) - LEFT OUTER JOIN misc_patterns lp + LEFT OUTER JOIN misc_patterns lp ON a.commandLanguage = lp.commandLanguage - LEFT OUTER JOIN misc_patterns rv -- runtime version + LEFT OUTER JOIN misc_patterns rv -- runtime version ON (a.commandLanguage = rv.commandLanguage OR rv.commandLanguage is null) AND a.dbr_version_major < rv.dbr_version_major AND rv.dbr_version_major is not null - LEFT OUTER JOIN misc_patterns dbr_type + LEFT OUTER JOIN misc_patterns dbr_type ON a.dbr_type = dbr_type.dbr_type and a.dbr_type in ('cpu','gpu') ), exp ( select distinct explode(issues) issue, workspace_id, notebook_id, cluster_id, email FROM pattern_matcher ) -SELECT +SELECT issue `Finding`, -- concat('',issue,'') as link, 
count(distinct workspace_id) `# workspaces`, @@ -56,4 +56,4 @@ SELECT count(distinct email) `# users` FROM exp group by 1 -order by 1 \ No newline at end of file +order by 1 diff --git a/src/databricks/labs/ucx/queries/assessment/interactive/02_0_cluster_summary.md b/src/databricks/labs/ucx/queries/assessment/interactive/02_0_cluster_summary.md index fc6ad0a60e..3fe2bbb439 100644 --- a/src/databricks/labs/ucx/queries/assessment/interactive/02_0_cluster_summary.md +++ b/src/databricks/labs/ucx/queries/assessment/interactive/02_0_cluster_summary.md @@ -12,4 +12,4 @@ Typical upgrade paths are: - For users with single node python ML requirements, Shared Compute with `%pip install` library support or Personal Compute with pools and compute controls may provide a better experience and better manageability. -- For single node ML users on a crowded driver node of a large shared cluster, will get a better experience with Personal Compute policies combined with (warm) Compute pools \ No newline at end of file +- For single node ML users on a crowded driver node of a large shared cluster, will get a better experience with Personal Compute policies combined with (warm) Compute pools diff --git a/src/databricks/labs/ucx/queries/assessment/interactive/03_0_cluster_summary.sql b/src/databricks/labs/ucx/queries/assessment/interactive/03_0_cluster_summary.sql index a46d89ccbe..14a588c4ca 100644 --- a/src/databricks/labs/ucx/queries/assessment/interactive/03_0_cluster_summary.sql +++ b/src/databricks/labs/ucx/queries/assessment/interactive/03_0_cluster_summary.sql @@ -1,9 +1,9 @@ -- viz type=table, name=Findings by Cluster, columns=distinct_findings,Commands,Users,First_command,Last_command,workspace_id,cluster_id,cluster_name,dbr_version,creator -- widget title=Findings by Cluster, row=3, col=0, size_x=6, size_y=12 --- -WITH +-- +WITH iteractive_cluster_commands ( - SELECT + SELECT a.event_id, a.request_params.notebookId AS notebook_id, a.request_params.clusterId AS cluster_id, @@ -32,16 +32,16 @@ pattern_matcher( explode(array_except(array(p.issue, lp.issue, rv.issue,dbr_type.issue), array(null))) issue, a.* FROM iteractive_cluster_commands a - LEFT OUTER JOIN $inventory.code_patterns p + LEFT OUTER JOIN $inventory.code_patterns p ON a.commandLanguage in ('python','scala') AND contains(a.commandText, p.pattern) - LEFT OUTER JOIN misc_patterns lp + LEFT OUTER JOIN misc_patterns lp ON a.commandLanguage = lp.commandLanguage - LEFT OUTER JOIN misc_patterns rv -- runtime version + LEFT OUTER JOIN misc_patterns rv -- runtime version ON (a.commandLanguage = rv.commandLanguage OR rv.commandLanguage is null) AND a.dbr_version_major < rv.dbr_version_major AND rv.dbr_version_major is not null - LEFT OUTER JOIN misc_patterns dbr_type + LEFT OUTER JOIN misc_patterns dbr_type ON a.dbr_type = dbr_type.dbr_type and a.dbr_type in ('cpu','gpu') ) SELECT @@ -58,4 +58,4 @@ SELECT FROM pattern_matcher GROUP BY ALL HAVING max(event_date) >= DATE_SUB(CURRENT_DATE(), 15) -- active in last N days -ORDER BY `Last command` desc, `First command` asc, coalesce(cluster_name,cluster_id) \ No newline at end of file +ORDER BY `Last command` desc, `First command` asc, coalesce(cluster_name,cluster_id) diff --git a/src/databricks/labs/ucx/queries/assessment/main/00_5_count_total_views.sql b/src/databricks/labs/ucx/queries/assessment/main/00_5_count_total_views.sql index bef2499c88..7d555f01ab 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/00_5_count_total_views.sql +++ 
b/src/databricks/labs/ucx/queries/assessment/main/00_5_count_total_views.sql @@ -1,4 +1,4 @@ -- viz type=counter, name=Total View Count, counter_label=Total Views, value_column=count_total_views -- widget row=1, col=4, size_x=1, size_y=3 -SELECT count(*) AS count_total_views +SELECT count(*) AS count_total_views FROM $inventory.tables where object_type = 'VIEW' diff --git a/src/databricks/labs/ucx/queries/assessment/main/00___assessment_overview.md b/src/databricks/labs/ucx/queries/assessment/main/00___assessment_overview.md index 341f88148a..6f15f7fe81 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/00___assessment_overview.md +++ b/src/databricks/labs/ucx/queries/assessment/main/00___assessment_overview.md @@ -2,4 +2,4 @@ # Assessment Overview -[Quick link to dashboard documentation](https://github.com/databrickslabs/ucx/blob/main/docs/assessment.md) \ No newline at end of file +[Quick link to dashboard documentation](https://github.com/databrickslabs/ucx/blob/main/docs/assessment.md) diff --git a/src/databricks/labs/ucx/queries/assessment/main/01_0_count_jobs.sql b/src/databricks/labs/ucx/queries/assessment/main/01_0_count_jobs.sql index 796759ad7d..0caada5641 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/01_0_count_jobs.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/01_0_count_jobs.sql @@ -1,4 +1,4 @@ -- viz type=counter, name=Total Job Count, counter_label=Total Jobs, value_column=count_total_jobs -- widget row=2, col=0, size_x=2, size_y=5 -SELECT count(*) AS count_total_jobs -FROM $inventory.jobs WHERE job_name not like '[UCX]%' \ No newline at end of file +SELECT count(*) AS count_total_jobs +FROM $inventory.jobs WHERE job_name not like '[UCX]%' diff --git a/src/databricks/labs/ucx/queries/assessment/main/02_2_count_table_by_storage.sql b/src/databricks/labs/ucx/queries/assessment/main/02_2_count_table_by_storage.sql index 4e484bb3c1..ff417146cb 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/02_2_count_table_by_storage.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/02_2_count_table_by_storage.sql @@ -16,4 +16,4 @@ SELECT END AS storage FROM $inventory.tables) GROUP BY storage -ORDER BY storage; \ No newline at end of file +ORDER BY storage; diff --git a/src/databricks/labs/ucx/queries/assessment/main/05_0_object_readiness.sql b/src/databricks/labs/ucx/queries/assessment/main/05_0_object_readiness.sql index 3ad263f8d4..b8ea4b0318 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/05_0_object_readiness.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/05_0_object_readiness.sql @@ -1,10 +1,10 @@ -- viz type=table, name=Object Type Readiness, columns=object_type,readiness -- widget title=Readiness, row=7, col=0, size_x=2, size_y=8 WITH raw AS ( - SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready + SELECT object_type, object_id, IF(failures == '[]', 1, 0) AS ready FROM $inventory.objects ) SELECT object_type, CONCAT(ROUND(SUM(ready) / COUNT(*) * 100, 1), '%') AS readiness FROM raw GROUP BY object_type -ORDER BY readiness DESC \ No newline at end of file +ORDER BY readiness DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/05_2_assessment_summary.sql b/src/databricks/labs/ucx/queries/assessment/main/05_2_assessment_summary.sql index 309f48cb7b..9c214adc98 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/05_2_assessment_summary.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/05_2_assessment_summary.sql @@ -4,7 +4,7 @@ WITH raw AS ( SELECT 
EXPLODE(FROM_JSON(failures, 'array')) AS finding FROM $inventory.objects WHERE failures <> '[]' ) -SELECT finding as `finding`, COUNT(*) AS count -FROM raw +SELECT finding as `finding`, COUNT(*) AS count +FROM raw GROUP BY finding ORDER BY count DESC, finding DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/10_0_all_udfs.sql b/src/databricks/labs/ucx/queries/assessment/main/10_0_all_udfs.sql index 5315bab00a..ebea97de30 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/10_0_all_udfs.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/10_0_all_udfs.sql @@ -1,3 +1,3 @@ -- viz type=table, name=UDF Summary, search_by=name, columns=catalog,database,name,func_type,func_input,func_returns,deterministic,data_access,body,comment -- widget title=UDF Summary, row=14, col=0, size_x=8, size_y=8 -SELECT * FROM $inventory.udfs \ No newline at end of file +SELECT * FROM $inventory.udfs diff --git a/src/databricks/labs/ucx/queries/assessment/main/10_0_database_summary.sql b/src/databricks/labs/ucx/queries/assessment/main/10_0_database_summary.sql index 0dee651d02..090d5158c5 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/10_0_database_summary.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/10_0_database_summary.sql @@ -50,4 +50,4 @@ WITH table_stats AS ( GROUP BY `database` ) SELECT * FROM database_stats FULL JOIN grant_stats USING (`database`) -ORDER BY tables DESC \ No newline at end of file +ORDER BY tables DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/15_3_mount_points.sql b/src/databricks/labs/ucx/queries/assessment/main/15_3_mount_points.sql index 31acd404d6..52faf6d75a 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/15_3_mount_points.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/15_3_mount_points.sql @@ -2,4 +2,4 @@ -- widget title=Mount Points, row=17, col=3, size_x=3, size_y=8 SELECT name, source -FROM $inventory.mounts \ No newline at end of file +FROM $inventory.mounts diff --git a/src/databricks/labs/ucx/queries/assessment/main/20_0_cluster_policies.sql b/src/databricks/labs/ucx/queries/assessment/main/20_0_cluster_policies.sql index a59ff7e70e..ed8daed749 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/20_0_cluster_policies.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/20_0_cluster_policies.sql @@ -6,4 +6,4 @@ SELECT policy.spark_version as policy_spark_version FROM $inventory.clusters as cluster JOIN $inventory.policies as policy -ON cluster.policy_id=policy.policy_id \ No newline at end of file +ON cluster.policy_id=policy.policy_id diff --git a/src/databricks/labs/ucx/queries/assessment/main/20_0_clusters.sql b/src/databricks/labs/ucx/queries/assessment/main/20_0_clusters.sql index fc7819e71d..1e2fc27bf0 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/20_0_clusters.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/20_0_clusters.sql @@ -6,4 +6,4 @@ SELECT EXPLODE(FROM_JSON(failures, 'array')) AS finding, creator FROM $inventory.clusters WHERE NOT STARTSWITH(cluster_name, "job-") -ORDER BY cluster_id DESC \ No newline at end of file +ORDER BY cluster_id DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/30_3_job_details.sql b/src/databricks/labs/ucx/queries/assessment/main/30_3_job_details.sql index 1ccadc2672..17c2d48b91 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/30_3_job_details.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/30_3_job_details.sql @@ -8,4 +8,4 @@ SELECT creator FROM $inventory.jobs 
WHERE job_name not like '[UCX]%' -ORDER BY job_id DESC \ No newline at end of file +ORDER BY job_id DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/30_3_jobs.sql b/src/databricks/labs/ucx/queries/assessment/main/30_3_jobs.sql index 43b8364b74..d39973eec5 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/30_3_jobs.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/30_3_jobs.sql @@ -7,4 +7,4 @@ SELECT creator FROM $inventory.jobs WHERE job_name not like '[UCX]%' -ORDER BY job_id DESC \ No newline at end of file +ORDER BY job_id DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/30_4_submit_runs.sql b/src/databricks/labs/ucx/queries/assessment/main/30_4_submit_runs.sql index c9b4f1de2a..96817307fd 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/30_4_submit_runs.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/30_4_submit_runs.sql @@ -5,4 +5,4 @@ SELECT EXPLODE(FROM_JSON(failures, 'array')) AS finding, FROM_JSON(run_ids, 'array') AS run_ids FROM $inventory.submit_runs -ORDER BY hashed_id DESC \ No newline at end of file +ORDER BY hashed_id DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/40_0_pipelines.sql b/src/databricks/labs/ucx/queries/assessment/main/40_0_pipelines.sql index 9d1e10273a..1e5c78b826 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/40_0_pipelines.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/40_0_pipelines.sql @@ -5,4 +5,4 @@ SELECT pipeline_name, creator_name FROM $inventory.pipelines -ORDER BY pipeline_name DESC \ No newline at end of file +ORDER BY pipeline_name DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/40_2_logs.sql b/src/databricks/labs/ucx/queries/assessment/main/40_2_logs.sql index 4683dc12af..e1ff5e594e 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/40_2_logs.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/40_2_logs.sql @@ -13,4 +13,4 @@ FROM $inventory.logs WHERE job_run_id = ( SELECT DISTINCT job_run_id FROM $inventory.logs WHERE timestamp = (SELECT MAX(timestamp) FROM $inventory.logs) ) -ORDER BY timestamp ASC \ No newline at end of file +ORDER BY timestamp ASC diff --git a/src/databricks/labs/ucx/queries/assessment/main/40_3_global_init_scripts.sql b/src/databricks/labs/ucx/queries/assessment/main/40_3_global_init_scripts.sql index a9a306b0fe..b1e9f79f74 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/40_3_global_init_scripts.sql +++ b/src/databricks/labs/ucx/queries/assessment/main/40_3_global_init_scripts.sql @@ -6,4 +6,4 @@ SELECT created_by FROM $inventory.global_init_scripts -ORDER BY script_name DESC \ No newline at end of file +ORDER BY script_name DESC diff --git a/src/databricks/labs/ucx/queries/assessment/main/README.md b/src/databricks/labs/ucx/queries/assessment/main/README.md index f83e1e03c4..2361d36f04 100644 --- a/src/databricks/labs/ucx/queries/assessment/main/README.md +++ b/src/databricks/labs/ucx/queries/assessment/main/README.md @@ -3,4 +3,4 @@ All files in this directory follow the virtual grid of a dashboard: * total width is 6 columns -* all files are named as `__something.sql` \ No newline at end of file +* all files are named as `__something.sql` diff --git a/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md b/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md index fa0352a2e6..60a441241b 100644 --- a/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md +++ 
b/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md @@ -2,11 +2,11 @@ ## Table migration status -The two widgets on the right show high-level summary of the table migration. The first widget shows the migration +The two widgets on the right show high-level summary of the table migration. The first widget shows the migration progress, and the second widget shows the data reconciliation results. -The table below assists with verifying if, how the tables are migrated and their correctness. It can be filtered on the +The table below assists with verifying if, how the tables are migrated and their correctness. It can be filtered on the table name and migration status. Next to table metadata, the table shows: - The table name before migrating - The migration status diff --git a/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql b/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql index 6c5d57798d..d92f61ae2e 100644 --- a/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql +++ b/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql @@ -13,4 +13,4 @@ FROM $inventory.tables AS tables LEFT JOIN $inventory.migration_status AS migration_status - ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table \ No newline at end of file + ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table diff --git a/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql b/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql index 70994baca7..02a5d18076 100644 --- a/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql +++ b/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql @@ -11,4 +11,4 @@ SELECT start_col, end_line, end_col -FROM $inventory.workflow_problems \ No newline at end of file +FROM $inventory.workflow_problems diff --git a/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql b/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql index a0acfd69aa..57ebfdc528 100644 --- a/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql +++ b/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql @@ -11,4 +11,4 @@ SELECT count(*) AS total, concat(round(success / total * 100, 2), '%') AS success_rate FROM - $inventory.reconciliation_results \ No newline at end of file + $inventory.reconciliation_results diff --git a/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql b/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql index 34e68e252a..aebe8dcacd 100644 --- a/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql +++ b/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql @@ -28,4 +28,4 @@ FROM ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table LEFT JOIN $inventory.reconciliation_results AS reconciliation_results - ON tables.`database` = reconciliation_results.src_schema AND tables.name = reconciliation_results.src_table \ No newline at end of file + ON tables.`database` = reconciliation_results.src_schema AND tables.name = reconciliation_results.src_table diff --git 
a/src/databricks/labs/ucx/queries/views/code_patterns.sql b/src/databricks/labs/ucx/queries/views/code_patterns.sql index 2baf021f0e..64fd69c3f5 100644 --- a/src/databricks/labs/ucx/queries/views/code_patterns.sql +++ b/src/databricks/labs/ucx/queries/views/code_patterns.sql @@ -9,22 +9,22 @@ SELECT col1 AS pattern, col2 AS issue FROM values ('._jvm', 'AF302.5 - Arbitrary Java'), ('._jvm.org.apache.log4j', 'AF302.6 - Arbitrary Java'), ('spark.udf.registerJavaFunction', 'AF303.1 - Java UDF'), - + ('spark.read.format("jdbc")', 'AF304.1 - JDBC datasource'), ('boto3', 'AF305.1 - boto3'), ('s3fs', 'AF305.2 - s3fs'), ('dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()', 'AF306.1 - dbutils...getContext'), ('dbutils.notebook.entry_point.getDbutils().notebook().getContext()', 'AF306.2 - dbutils...getContext'), - + ('dbutils.credentials.', 'AF310.1 - credential passthrough'), ('dbutils.fs.', 'AF311.1 - dbutils.fs'), ('dbutils.fs.mount', 'AF311.2 - dbutils mount(s)'), ('dbutils.fs.refreshMounts', 'AF311.3 - dbutils mount(s)'), ('dbutils.fs.unmount', 'AF311.4 - dbutils mount(s)'), - ('dbfs:/mnt', 'AF311.5 - mount points'), - ('dbfs:/', 'AF311.6 - dbfs usage'), - ('/dbfs/', 'AF311.7 - dbfs usage'), + ('dbfs:/mnt', 'AF311.5 - mount points'), + ('dbfs:/', 'AF311.6 - dbfs usage'), + ('/dbfs/', 'AF311.7 - dbfs usage'), ('spark.sparkContext', 'AF313.1 - SparkContext'), @@ -54,7 +54,7 @@ SELECT col1 AS pattern, col2 AS issue FROM values ('.union', 'AF313.24 - SparkContext'), ('.wholeTextFiles', 'AF313.25 - SparkContext'), - ('sparknlp', 'AF314.1 - Distributed ML'), + ('sparknlp', 'AF314.1 - Distributed ML'), ('xgboost.spark', 'AF314.2 - Distributed ML'), ('catboost_spark', 'AF314.3 - Distributed ML'), ('ai.catboost:catboost-spark', 'AF314.4 - Distributed ML'), @@ -69,7 +69,7 @@ SELECT col1 AS pattern, col2 AS issue FROM values ('UserDefinedAggregateFunction', 'AF315.1 - UDAF scala issue'), ('applyInPandas', 'AF315.2 - applyInPandas'), ('mapInPandas', 'AF315.3 - mapInPandas'), - + ('.trigger(continuous', 'AF330.1 - Streaming'), ('kafka.sasl.client.callback.handler.class', 'AF330.2 - Streaming'), @@ -83,4 +83,4 @@ SELECT col1 AS pattern, col2 AS issue FROM values ('applyInPandasWithState', 'AF330.10 - Streaming'), ('.format("socket")', 'AF330.11 - Streaming'), ('StreamingQueryListener', 'AF330.12 - Streaming'), - ('applyInPandasWithState', 'AF330.13 - Streaming') \ No newline at end of file + ('applyInPandasWithState', 'AF330.13 - Streaming') diff --git a/src/databricks/labs/ucx/queries/views/misc_patterns.sql b/src/databricks/labs/ucx/queries/views/misc_patterns.sql index 7a3d8f8a80..9f91f1203e 100644 --- a/src/databricks/labs/ucx/queries/views/misc_patterns.sql +++ b/src/databricks/labs/ucx/queries/views/misc_patterns.sql @@ -1,6 +1,6 @@ -SELECT +SELECT col1 AS commandLanguage, -- r, scala, python, sql - col2 as dbr_version_major, -- INT + col2 as dbr_version_major, -- INT col3 as dbr_version_minor, -- INT col4 as dbr_type, -- STRING col5 AS pattern, -- expansion / compatibility with code patterns @@ -10,4 +10,4 @@ FROM VALUES ('scala', 13, 3, null, null, 'AF300.3 - Scala Language support'), (null, 11, 3, null, null, 'AF300.4 - Minimum DBR version'), (null, null, null, 'cpu', null, 'AF300.5 - ML Runtime cpu'), - (null, null, null, 'gpu', null, 'AF300.6 - ML Runtime gpu') \ No newline at end of file + (null, null, null, 'gpu', null, 'AF300.6 - ML Runtime gpu') diff --git a/src/databricks/labs/ucx/queries/views/reconciliation_results.sql 
b/src/databricks/labs/ucx/queries/views/reconciliation_results.sql index ef60632235..d54c6d3838 100644 --- a/src/databricks/labs/ucx/queries/views/reconciliation_results.sql +++ b/src/databricks/labs/ucx/queries/views/reconciliation_results.sql @@ -27,4 +27,4 @@ SELECT schema_comparison_result.data AS column_comparison, error_message FROM - flattened \ No newline at end of file + flattened diff --git a/src/databricks/labs/ucx/queries/views/table_estimates.sql b/src/databricks/labs/ucx/queries/views/table_estimates.sql index 3139020962..3903aa794c 100644 --- a/src/databricks/labs/ucx/queries/views/table_estimates.sql +++ b/src/databricks/labs/ucx/queries/views/table_estimates.sql @@ -10,4 +10,4 @@ when object_type == "EXTERNAL" and table_format != "DELTA" then 1 -- Can vary de when object_type == "VIEW" then 2 -- Can vary depending of view complexity and number of tables used in the view else NULL end as estimated_hours from $inventory.tables -where not startswith(name, "__apply_changes") \ No newline at end of file +where not startswith(name, "__apply_changes") diff --git a/src/databricks/labs/ucx/recon/data_comparator.py b/src/databricks/labs/ucx/recon/data_comparator.py index 7195058280..37e1beaa41 100644 --- a/src/databricks/labs/ucx/recon/data_comparator.py +++ b/src/databricks/labs/ucx/recon/data_comparator.py @@ -15,18 +15,18 @@ class StandardDataComparator(DataComparator): _DATA_COMPARISON_QUERY_TEMPLATE = """ WITH compare_results AS ( - SELECT - CASE + SELECT + CASE WHEN source.hash_value IS NULL AND target.hash_value IS NULL THEN TRUE WHEN source.hash_value IS NULL OR target.hash_value IS NULL THEN FALSE WHEN source.hash_value = target.hash_value THEN TRUE ELSE FALSE END AS is_match, - CASE + CASE WHEN target.hash_value IS NULL THEN 1 ELSE 0 END AS target_missing_count, - CASE + CASE WHEN source.hash_value IS NULL THEN 1 ELSE 0 END AS source_missing_count @@ -40,7 +40,7 @@ class StandardDataComparator(DataComparator): ) AS target ON source.hash_value = target.hash_value ) - SELECT + SELECT COUNT(*) AS total_mismatches, COALESCE(SUM(target_missing_count), 0) AS target_missing_count, COALESCE(SUM(source_missing_count), 0) AS source_missing_count diff --git a/src/databricks/labs/ucx/recon/metadata_retriever.py b/src/databricks/labs/ucx/recon/metadata_retriever.py index f3b3cfeb95..5e4a86b03a 100644 --- a/src/databricks/labs/ucx/recon/metadata_retriever.py +++ b/src/databricks/labs/ucx/recon/metadata_retriever.py @@ -37,14 +37,14 @@ def _build_metadata_query(cls, entity: TableIdentifier) -> str: return f"DESCRIBE TABLE {entity.fqn_escaped}" query = f""" - SELECT - LOWER(column_name) AS col_name, + SELECT + LOWER(column_name) AS col_name, full_data_type AS data_type - FROM + FROM {entity.catalog_escaped}.information_schema.columns WHERE - LOWER(table_catalog)='{entity.catalog}' AND - LOWER(table_schema)='{entity.schema}' AND + LOWER(table_catalog)='{entity.catalog}' AND + LOWER(table_schema)='{entity.schema}' AND LOWER(table_name) ='{entity.table}' ORDER BY col_name""" diff --git a/tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json b/tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json index fc0bdc4995..472e715a1d 100644 --- a/tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json +++ b/tests/unit/assessment/clusters/azure-spn-secret-interactive-multiple-spn.json @@ -17,4 +17,4 @@ }, "spark_context_id": 5134472582179565315, "spark_version": "13.3.x-cpu-ml-scala2.12" -} \ No newline at end of file +} diff --git 
a/tests/unit/assessment/clusters/azure-spn-secret.json b/tests/unit/assessment/clusters/azure-spn-secret.json index 25f180c99f..941c72f111 100644 --- a/tests/unit/assessment/clusters/azure-spn-secret.json +++ b/tests/unit/assessment/clusters/azure-spn-secret.json @@ -13,4 +13,4 @@ }, "spark_context_id": 5134472582179565315, "spark_version": "13.3.x-cpu-ml-scala2.12" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/init-scripts-dbfs.json b/tests/unit/assessment/clusters/init-scripts-dbfs.json index 0012714caa..0dfb769362 100644 --- a/tests/unit/assessment/clusters/init-scripts-dbfs.json +++ b/tests/unit/assessment/clusters/init-scripts-dbfs.json @@ -29,4 +29,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/init-scripts-file.json b/tests/unit/assessment/clusters/init-scripts-file.json index 13c5ff631c..75769b1e57 100644 --- a/tests/unit/assessment/clusters/init-scripts-file.json +++ b/tests/unit/assessment/clusters/init-scripts-file.json @@ -24,4 +24,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/init-scripts-no-match.json b/tests/unit/assessment/clusters/init-scripts-no-match.json index 723312509f..f85a68df59 100644 --- a/tests/unit/assessment/clusters/init-scripts-no-match.json +++ b/tests/unit/assessment/clusters/init-scripts-no-match.json @@ -14,4 +14,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/job-cluster.json b/tests/unit/assessment/clusters/job-cluster.json index 3a7445e03a..199f7e21a9 100644 --- a/tests/unit/assessment/clusters/job-cluster.json +++ b/tests/unit/assessment/clusters/job-cluster.json @@ -13,4 +13,4 @@ "spark.databricks.delta.preview.enabled": "true" }, "spark_context_id":"5134472582179565315" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/legacy-passthrough.json b/tests/unit/assessment/clusters/legacy-passthrough.json index ce3478e956..a243f29845 100644 --- a/tests/unit/assessment/clusters/legacy-passthrough.json +++ b/tests/unit/assessment/clusters/legacy-passthrough.json @@ -2,4 +2,4 @@ "cluster_name": "Passthrough cluster", "spark_version": "12.3.x-cpu-ml-scala2.12", "data_security_mode": "LEGACY_PASSTHROUGH" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/no-isolation.json b/tests/unit/assessment/clusters/no-isolation.json index 8e549821f3..ae95fcf868 100644 --- a/tests/unit/assessment/clusters/no-isolation.json +++ b/tests/unit/assessment/clusters/no-isolation.json @@ -2,4 +2,4 @@ "cluster_name": "No isolation shared", "spark_version": "12.3.x-cpu-ml-scala2.12", "data_security_mode": "NONE" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/outdated-autoscale.json b/tests/unit/assessment/clusters/outdated-autoscale.json index 2fe7ff61da..546d2dbd25 100644 --- a/tests/unit/assessment/clusters/outdated-autoscale.json +++ b/tests/unit/assessment/clusters/outdated-autoscale.json @@ -6,4 +6,4 @@ "cluster_id": "outdated-autoscale", "cluster_name": "Outdated Shared Autoscale", "spark_version": "9.3.x-cpu-ml-scala2.12" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/passthrough.json b/tests/unit/assessment/clusters/passthrough.json index baf5ace1ce..396d7d7dc3 100644 --- a/tests/unit/assessment/clusters/passthrough.json +++ b/tests/unit/assessment/clusters/passthrough.json @@ -10,4 +10,4 @@ "spark_conf" : { "spark.databricks.passthrough.enabled": "True" } -} \ No newline at end of file +} diff --git 
a/tests/unit/assessment/clusters/policy-azure-oauth.json b/tests/unit/assessment/clusters/policy-azure-oauth.json index 1713f36c4e..26bf3f5b46 100644 --- a/tests/unit/assessment/clusters/policy-azure-oauth.json +++ b/tests/unit/assessment/clusters/policy-azure-oauth.json @@ -12,4 +12,4 @@ "spark.databricks.delta.preview.enabled": "true" }, "spark_context_id": "5134472582179565315" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/policy-deleted.json b/tests/unit/assessment/clusters/policy-deleted.json index 389a7cf7da..4dd0336127 100644 --- a/tests/unit/assessment/clusters/policy-deleted.json +++ b/tests/unit/assessment/clusters/policy-deleted.json @@ -12,4 +12,4 @@ "spark.databricks.delta.preview.enabled": "true" }, "spark_context_id": "5134472582179565315" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/policy-single-user-with-empty-appid-spn.json b/tests/unit/assessment/clusters/policy-single-user-with-empty-appid-spn.json index d168b63459..3d9cd56ed8 100644 --- a/tests/unit/assessment/clusters/policy-single-user-with-empty-appid-spn.json +++ b/tests/unit/assessment/clusters/policy-single-user-with-empty-appid-spn.json @@ -9,4 +9,4 @@ "cluster_id": "0810-225833-atlanta69", "cluster_name": "Tech Summit FY24 Cluster-1", "policy_id": "single-user-with-empty-appid-spn" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/policy-single-user-with-spn.json b/tests/unit/assessment/clusters/policy-single-user-with-spn.json index 496703da49..9aca28d2b8 100644 --- a/tests/unit/assessment/clusters/policy-single-user-with-spn.json +++ b/tests/unit/assessment/clusters/policy-single-user-with-spn.json @@ -12,4 +12,4 @@ "spark.databricks.delta.preview.enabled": "true" }, "spark_context_id": "5134472582179565315" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/policy-spn-in-policy-overrides.json b/tests/unit/assessment/clusters/policy-spn-in-policy-overrides.json index 8c475b629f..cf34e3a166 100644 --- a/tests/unit/assessment/clusters/policy-spn-in-policy-overrides.json +++ b/tests/unit/assessment/clusters/policy-spn-in-policy-overrides.json @@ -9,4 +9,4 @@ "cluster_id": "0810-225833-atlanta69", "cluster_name": "Tech Summit FY24 Cluster-1", "policy_id": "spn-in-policy-overrides" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/clusters/simplest-autoscale.json b/tests/unit/assessment/clusters/simplest-autoscale.json index 7b36a9552b..34d583bb0f 100644 --- a/tests/unit/assessment/clusters/simplest-autoscale.json +++ b/tests/unit/assessment/clusters/simplest-autoscale.json @@ -7,4 +7,4 @@ "cluster_name": "Simplest Shared Autoscale", "policy_id": "single-user-with-spn", "spark_version": "13.3.x-cpu-ml-scala2.12" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/dbt_task.json b/tests/unit/assessment/jobruns/dbt_task.json index 4c2d32ce3b..38dcd46e25 100644 --- a/tests/unit/assessment/jobruns/dbt_task.json +++ b/tests/unit/assessment/jobruns/dbt_task.json @@ -20,4 +20,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/gitsource_task.json b/tests/unit/assessment/jobruns/gitsource_task.json index cd7179b4c6..3d7d32d1d0 100644 --- a/tests/unit/assessment/jobruns/gitsource_task.json +++ b/tests/unit/assessment/jobruns/gitsource_task.json @@ -17,4 +17,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/jar_task.json b/tests/unit/assessment/jobruns/jar_task.json index 
9db65106be..3220668898 100644 --- a/tests/unit/assessment/jobruns/jar_task.json +++ b/tests/unit/assessment/jobruns/jar_task.json @@ -13,4 +13,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_dupe_task.json b/tests/unit/assessment/jobruns/notebook_dupe_task.json index 45a5bc57d4..eba5311f2f 100644 --- a/tests/unit/assessment/jobruns/notebook_dupe_task.json +++ b/tests/unit/assessment/jobruns/notebook_dupe_task.json @@ -12,4 +12,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_no_failure_task.json b/tests/unit/assessment/jobruns/notebook_no_failure_task.json index 9ea4bad124..30826280dc 100644 --- a/tests/unit/assessment/jobruns/notebook_no_failure_task.json +++ b/tests/unit/assessment/jobruns/notebook_no_failure_task.json @@ -12,4 +12,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_no_sec_comp_task.json b/tests/unit/assessment/jobruns/notebook_no_sec_comp_task.json index b502cc27f2..148229f3f7 100644 --- a/tests/unit/assessment/jobruns/notebook_no_sec_comp_task.json +++ b/tests/unit/assessment/jobruns/notebook_no_sec_comp_task.json @@ -18,4 +18,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_no_sec_no_comp_task.json b/tests/unit/assessment/jobruns/notebook_no_sec_no_comp_task.json index 495a8c25bc..1ad58ac695 100644 --- a/tests/unit/assessment/jobruns/notebook_no_sec_no_comp_task.json +++ b/tests/unit/assessment/jobruns/notebook_no_sec_no_comp_task.json @@ -18,4 +18,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_spark_conf_task.json b/tests/unit/assessment/jobruns/notebook_spark_conf_task.json index 30b49fd50b..024c0e574f 100644 --- a/tests/unit/assessment/jobruns/notebook_spark_conf_task.json +++ b/tests/unit/assessment/jobruns/notebook_spark_conf_task.json @@ -22,4 +22,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/notebook_task.json b/tests/unit/assessment/jobruns/notebook_task.json index b1f4bf0f29..471453f337 100644 --- a/tests/unit/assessment/jobruns/notebook_task.json +++ b/tests/unit/assessment/jobruns/notebook_task.json @@ -12,4 +12,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/python_wheel_task.json b/tests/unit/assessment/jobruns/python_wheel_task.json index 290ea45c09..081030c714 100644 --- a/tests/unit/assessment/jobruns/python_wheel_task.json +++ b/tests/unit/assessment/jobruns/python_wheel_task.json @@ -13,4 +13,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/run_condition_task.json b/tests/unit/assessment/jobruns/run_condition_task.json index d6ecc5fa25..75cb398d20 100644 --- a/tests/unit/assessment/jobruns/run_condition_task.json +++ b/tests/unit/assessment/jobruns/run_condition_task.json @@ -15,4 +15,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/spark_jar_task.json b/tests/unit/assessment/jobruns/spark_jar_task.json index 8166878d81..b25cbc08a3 100644 --- a/tests/unit/assessment/jobruns/spark_jar_task.json +++ b/tests/unit/assessment/jobruns/spark_jar_task.json @@ -13,4 +13,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobruns/sql_tasks.json b/tests/unit/assessment/jobruns/sql_tasks.json index c3d8fbc816..3f9e0ff57d 100644 --- a/tests/unit/assessment/jobruns/sql_tasks.json +++ b/tests/unit/assessment/jobruns/sql_tasks.json @@ -45,4 +45,4 
@@ } } ] -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/legacy-job-on-azure-spn-secret.json b/tests/unit/assessment/jobs/legacy-job-on-azure-spn-secret.json index 8997a788bc..118979d83f 100644 --- a/tests/unit/assessment/jobs/legacy-job-on-azure-spn-secret.json +++ b/tests/unit/assessment/jobs/legacy-job-on-azure-spn-secret.json @@ -9,4 +9,4 @@ "notebook_path": "/Users/foo.bar@databricks.com/Customers/Example/Test/Load" } } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/no-settings.json b/tests/unit/assessment/jobs/no-settings.json index 366c51ebef..ccee446571 100644 --- a/tests/unit/assessment/jobs/no-settings.json +++ b/tests/unit/assessment/jobs/no-settings.json @@ -1,3 +1,3 @@ { "job_id": 9002 -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/no-tasks.json b/tests/unit/assessment/jobs/no-tasks.json index 04c6da54fa..0615479de1 100644 --- a/tests/unit/assessment/jobs/no-tasks.json +++ b/tests/unit/assessment/jobs/no-tasks.json @@ -3,4 +3,4 @@ "settings": { "name": "No Tasks" } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/on-azure-spn-secret.json b/tests/unit/assessment/jobs/on-azure-spn-secret.json index 85b749f377..d79535c716 100644 --- a/tests/unit/assessment/jobs/on-azure-spn-secret.json +++ b/tests/unit/assessment/jobs/on-azure-spn-secret.json @@ -14,4 +14,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/on-outdated-autoscale.json b/tests/unit/assessment/jobs/on-outdated-autoscale.json index 09c1418e33..b2e3a45b16 100644 --- a/tests/unit/assessment/jobs/on-outdated-autoscale.json +++ b/tests/unit/assessment/jobs/on-outdated-autoscale.json @@ -14,4 +14,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/on-simplest-autoscale.json b/tests/unit/assessment/jobs/on-simplest-autoscale.json index 830af2f1cb..84c8849b64 100644 --- a/tests/unit/assessment/jobs/on-simplest-autoscale.json +++ b/tests/unit/assessment/jobs/on-simplest-autoscale.json @@ -14,4 +14,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/policy-single-job-with-spn.json b/tests/unit/assessment/jobs/policy-single-job-with-spn.json index b5eef5c653..5f408544ac 100644 --- a/tests/unit/assessment/jobs/policy-single-job-with-spn.json +++ b/tests/unit/assessment/jobs/policy-single-job-with-spn.json @@ -22,4 +22,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/single-job.json b/tests/unit/assessment/jobs/single-job.json index d7a599536c..4d37073542 100644 --- a/tests/unit/assessment/jobs/single-job.json +++ b/tests/unit/assessment/jobs/single-job.json @@ -13,4 +13,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/some-spn.json b/tests/unit/assessment/jobs/some-spn.json index f36d2a486c..8085c279a6 100644 --- a/tests/unit/assessment/jobs/some-spn.json +++ b/tests/unit/assessment/jobs/some-spn.json @@ -25,4 +25,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/jobs/spark-jar-task.json b/tests/unit/assessment/jobs/spark-jar-task.json index df70ea0420..163e89114c 100644 --- a/tests/unit/assessment/jobs/spark-jar-task.json +++ b/tests/unit/assessment/jobs/spark-jar-task.json @@ -15,4 +15,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/pipelines/empty-spec.json b/tests/unit/assessment/pipelines/empty-spec.json index 42956973fe..5f5312dbe3 100644 --- a/tests/unit/assessment/pipelines/empty-spec.json 
+++ b/tests/unit/assessment/pipelines/empty-spec.json @@ -6,4 +6,4 @@ "configuration": { } } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/pipelines/spec-with-spn.json b/tests/unit/assessment/pipelines/spec-with-spn.json index a668ce7bd2..320b9229f9 100644 --- a/tests/unit/assessment/pipelines/spec-with-spn.json +++ b/tests/unit/assessment/pipelines/spec-with-spn.json @@ -49,4 +49,4 @@ "spark.hadoop.fs.azure.sas.fixed.token.abcde.dfs.core.windows.net": "{{secrets/abcde_access/sasFixedToken}}" } } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/ext-hms.json b/tests/unit/assessment/policies/ext-hms.json index 28579017a8..ccea72a477 100644 --- a/tests/unit/assessment/policies/ext-hms.json +++ b/tests/unit/assessment/policies/ext-hms.json @@ -33,4 +33,4 @@ }, "policy_family_definition_overrides": {}, "name": "ext_hms" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/single-job-with-spn.json b/tests/unit/assessment/policies/single-job-with-spn.json index 0ce429360c..11d34533f4 100644 --- a/tests/unit/assessment/policies/single-job-with-spn.json +++ b/tests/unit/assessment/policies/single-job-with-spn.json @@ -27,4 +27,4 @@ } }, "policy_family_definition_overrides": {} -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/single-user-with-empty-appid-spn.json b/tests/unit/assessment/policies/single-user-with-empty-appid-spn.json index dabe6d4bc0..42e453eed6 100644 --- a/tests/unit/assessment/policies/single-user-with-empty-appid-spn.json +++ b/tests/unit/assessment/policies/single-user-with-empty-appid-spn.json @@ -28,4 +28,4 @@ }, "policy_family_definition_overrides": { } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/single-user-with-spn-no-sparkversion.json b/tests/unit/assessment/policies/single-user-with-spn-no-sparkversion.json index fd94845044..ae4f4640d1 100644 --- a/tests/unit/assessment/policies/single-user-with-spn-no-sparkversion.json +++ b/tests/unit/assessment/policies/single-user-with-spn-no-sparkversion.json @@ -10,4 +10,4 @@ }, "policy_family_definition_overrides": { } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/single-user-with-spn-policyid.json b/tests/unit/assessment/policies/single-user-with-spn-policyid.json index e152cc3325..dfb967b1b1 100644 --- a/tests/unit/assessment/policies/single-user-with-spn-policyid.json +++ b/tests/unit/assessment/policies/single-user-with-spn-policyid.json @@ -31,4 +31,4 @@ "name": "test_policy", "description": "test", "creator_user_name": "test_creator" -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/single-user-with-spn.json b/tests/unit/assessment/policies/single-user-with-spn.json index 37b6726233..6f81a8403c 100644 --- a/tests/unit/assessment/policies/single-user-with-spn.json +++ b/tests/unit/assessment/policies/single-user-with-spn.json @@ -79,4 +79,4 @@ "hidden": true } } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/policies/spn-in-policy-overrides.json b/tests/unit/assessment/policies/spn-in-policy-overrides.json index 89dfb89f0e..8f5fce4fd1 100644 --- a/tests/unit/assessment/policies/spn-in-policy-overrides.json +++ b/tests/unit/assessment/policies/spn-in-policy-overrides.json @@ -27,4 +27,4 @@ "hidden": "true" } } -} \ No newline at end of file +} diff --git a/tests/unit/assessment/warehouses/dupe-spn-config.json b/tests/unit/assessment/warehouses/dupe-spn-config.json index 0c0bac552a..4c02199df9 100644 --- 
a/tests/unit/assessment/warehouses/dupe-spn-config.json +++ b/tests/unit/assessment/warehouses/dupe-spn-config.json @@ -39,4 +39,4 @@ "key": "spark.hadoop.fs.azure.account.oauth2.client.endpoint.newstorageacct.dfs.core.windows.net", "value": "https://login.microsoftonline.com/directory_12345/oauth2/token" } -] \ No newline at end of file +] diff --git a/tests/unit/assessment/warehouses/single-config.json b/tests/unit/assessment/warehouses/single-config.json index 1d10bc5981..5dbf11fafc 100644 --- a/tests/unit/assessment/warehouses/single-config.json +++ b/tests/unit/assessment/warehouses/single-config.json @@ -3,4 +3,4 @@ "key": "spark.hadoop.fs.azure.account.auth.type.storage_acct1.dfs.core.windows.net", "value": "OAuth" } -] \ No newline at end of file +] diff --git a/tests/unit/assessment/warehouses/spn-config.json b/tests/unit/assessment/warehouses/spn-config.json index fee278f6bc..0c73f970bd 100644 --- a/tests/unit/assessment/warehouses/spn-config.json +++ b/tests/unit/assessment/warehouses/spn-config.json @@ -39,4 +39,4 @@ "key": "spark.hadoop.fs.azure.account.oauth2.client.endpoint.storage_acct1.dfs.core.windows.net", "value": "https://login.microsoftonline.com/dummy_tenant_id_2/oauth2/token" } -] \ No newline at end of file +] diff --git a/tests/unit/assessment/warehouses/spn-secret-config.json b/tests/unit/assessment/warehouses/spn-secret-config.json index 0154585c19..7045af8697 100644 --- a/tests/unit/assessment/warehouses/spn-secret-config.json +++ b/tests/unit/assessment/warehouses/spn-secret-config.json @@ -39,4 +39,4 @@ "key": "spark.hadoop.fs.azure.account.oauth2.client.endpoint.xyz.dfs.core.windows.net", "value": "https://login.microsoftonline.com/dummy_tenant_id2/oauth2/token" } -] \ No newline at end of file +] diff --git a/tests/unit/azure/azure/mappings.json b/tests/unit/azure/azure/mappings.json index 73141e7795..dc1e0e41ca 100644 --- a/tests/unit/azure/azure/mappings.json +++ b/tests/unit/azure/azure/mappings.json @@ -418,4 +418,4 @@ } } } -] \ No newline at end of file +] diff --git a/tests/unit/hive_metastore/tables/dbfs_parquet.json b/tests/unit/hive_metastore/tables/dbfs_parquet.json index a787ffa363..559eb1726b 100644 --- a/tests/unit/hive_metastore/tables/dbfs_parquet.json +++ b/tests/unit/hive_metastore/tables/dbfs_parquet.json @@ -15,4 +15,4 @@ "src_table": "managed_dbfs", "dst_table": "managed_dbfs" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/external_hiveserde.json b/tests/unit/hive_metastore/tables/external_hiveserde.json index 58b67468e0..f690fefd45 100644 --- a/tests/unit/hive_metastore/tables/external_hiveserde.json +++ b/tests/unit/hive_metastore/tables/external_hiveserde.json @@ -15,4 +15,4 @@ "src_table": "external_src", "dst_table": "external_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/external_no_sync.json b/tests/unit/hive_metastore/tables/external_no_sync.json index 73e4c5fecc..3c8cc3f002 100644 --- a/tests/unit/hive_metastore/tables/external_no_sync.json +++ b/tests/unit/hive_metastore/tables/external_no_sync.json @@ -15,4 +15,4 @@ "src_table": "external_src", "dst_table": "external_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/external_no_sync_missing_location.json b/tests/unit/hive_metastore/tables/external_no_sync_missing_location.json index 71f3c64ee7..2df0fe6ae4 100644 --- a/tests/unit/hive_metastore/tables/external_no_sync_missing_location.json +++ b/tests/unit/hive_metastore/tables/external_no_sync_missing_location.json 
@@ -14,4 +14,4 @@ "src_table": "external_src", "dst_table": "external_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/external_src.json b/tests/unit/hive_metastore/tables/external_src.json index 46c29a6999..7b1c2f79e3 100644 --- a/tests/unit/hive_metastore/tables/external_src.json +++ b/tests/unit/hive_metastore/tables/external_src.json @@ -14,4 +14,4 @@ "src_table": "external_src", "dst_table": "external_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/external_src_unsupported.json b/tests/unit/hive_metastore/tables/external_src_unsupported.json index 32b6cbbe43..4510a984d9 100644 --- a/tests/unit/hive_metastore/tables/external_src_unsupported.json +++ b/tests/unit/hive_metastore/tables/external_src_unsupported.json @@ -14,4 +14,4 @@ "src_table": "external_src", "dst_table": "external_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/managed_dbfs.json b/tests/unit/hive_metastore/tables/managed_dbfs.json index 5b9bb84b90..1bac3c396a 100644 --- a/tests/unit/hive_metastore/tables/managed_dbfs.json +++ b/tests/unit/hive_metastore/tables/managed_dbfs.json @@ -15,4 +15,4 @@ "src_table": "managed_dbfs", "dst_table": "managed_dbfs" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/managed_mnt.json b/tests/unit/hive_metastore/tables/managed_mnt.json index d7b96db341..cc93561cc8 100644 --- a/tests/unit/hive_metastore/tables/managed_mnt.json +++ b/tests/unit/hive_metastore/tables/managed_mnt.json @@ -15,4 +15,4 @@ "src_table": "managed_mnt", "dst_table": "managed_mnt" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/managed_other.json b/tests/unit/hive_metastore/tables/managed_other.json index 3d97e942c6..3078b3b776 100644 --- a/tests/unit/hive_metastore/tables/managed_other.json +++ b/tests/unit/hive_metastore/tables/managed_other.json @@ -15,4 +15,4 @@ "src_table": "managed_other", "dst_table": "managed_other" } -} \ No newline at end of file +} diff --git a/tests/unit/hive_metastore/tables/tables_and_views.json b/tests/unit/hive_metastore/tables/tables_and_views.json index 0c9abd64c9..3435fd52bc 100644 --- a/tests/unit/hive_metastore/tables/tables_and_views.json +++ b/tests/unit/hive_metastore/tables/tables_and_views.json @@ -91,4 +91,4 @@ } -] \ No newline at end of file +] diff --git a/tests/unit/hive_metastore/tables/view.json b/tests/unit/hive_metastore/tables/view.json index d699377c8a..24c6efabcc 100644 --- a/tests/unit/hive_metastore/tables/view.json +++ b/tests/unit/hive_metastore/tables/view.json @@ -15,4 +15,4 @@ "src_table": "view_src", "dst_table": "view_dst" } -} \ No newline at end of file +} diff --git a/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r b/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r index 00d4fa9aa0..1cc70187b2 100644 --- a/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r +++ b/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r @@ -517,4 +517,4 @@ saveAsTable(forecast_evals, "forecast_evals_sparkr", "delta", "append", mergeSch # MAGIC | library | description | license | source | # MAGIC |----------------------------------------|-------------------------|------------|-----------------------------------------------------| # MAGIC | prophet |Implements a procedure for forecasting time series data based on an additive model | MIT | https://cran.r-project.org/web/packages/prophet/index.html | -# 
MAGIC | Metrics | An implementation of evaluation metrics in R that are commonly used in supervised machine learning | BSD 3 | https://cran.r-project.org/web/packages/Metrics/index.html | +# MAGIC | Metrics | An implementation of evaluation metrics in R that are commonly used in supervised machine learning | BSD 3 | https://cran.r-project.org/web/packages/Metrics/index.html | diff --git a/ucx.iml b/ucx.iml index b714dec3e7..5246a84846 100644 --- a/ucx.iml +++ b/ucx.iml @@ -10,4 +10,4 @@ - \ No newline at end of file + From eae95adcbfafe5148ef2eb4485e5a456c6045334 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:35:41 +0200 Subject: [PATCH 11/22] Infer values across notebook cells (#1968) ## Changes When linting Python code, infer values using not only code from the current cell but also code from previous cells. ### Linked issues Progresses #1912 Progresses #1205 ### Functionality None ### Tests - [x] manually tested - [x] added unit tests Resolved 60 out of 891 "cannot be computed" advices when running `make solacc` --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/base.py | 32 ++- .../labs/ucx/source_code/known.json | 61 ++++- .../labs/ucx/source_code/linters/context.py | 23 +- .../labs/ucx/source_code/linters/dbfs.py | 12 +- .../labs/ucx/source_code/linters/imports.py | 14 +- .../labs/ucx/source_code/linters/pyspark.py | 21 +- .../ucx/source_code/linters/python_ast.py | 223 +++--------------- .../ucx/source_code/linters/python_infer.py | 221 +++++++++++++++++ .../ucx/source_code/linters/spark_connect.py | 7 +- .../ucx/source_code/linters/table_creation.py | 7 +- .../labs/ucx/source_code/notebooks/sources.py | 11 +- .../source_code/linters/test_python_ast.py | 172 ++------------ .../source_code/linters/test_python_infer.py | 176 ++++++++++++++ .../samples/values_across_cells.py | 6 + .../unit/source_code/test_notebook_linter.py | 20 ++ 15 files changed, 608 insertions(+), 398 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/linters/python_infer.py create mode 100644 tests/unit/source_code/linters/test_python_infer.py create mode 100644 tests/unit/source_code/samples/values_across_cells.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index bc79231f58..ccd1a1fa7d 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -6,10 +6,11 @@ from dataclasses import dataclass from pathlib import Path -from astroid import NodeNG # type: ignore +from astroid import AstroidSyntaxError, NodeNG # type: ignore from databricks.sdk.service import compute +from databricks.labs.ucx.source_code.linters.python_ast import Tree # Code mapping between LSP, PyLint, and our own diagnostics: # | LSP | PyLint | Our | @@ -130,6 +131,16 @@ class Linter: def lint(self, code: str) -> Iterable[Advice]: ... +class PythonLinter(Linter): + + def lint(self, code: str) -> Iterable[Advice]: + tree = Tree.normalize_and_parse(code) + yield from self.lint_tree(tree) + + @abstractmethod + def lint_tree(self, tree: Tree) -> Iterable[Advice]: ... + + class Fixer: @abstractmethod def name(self) -> str: ...
@@ -170,3 +181,22 @@ def __init__(self, linters: list[Linter]): def lint(self, code: str) -> Iterable[Advice]: for linter in self._linters: yield from linter.lint(code) + + +class PythonSequentialLinter(Linter): + + def __init__(self, linters: list[PythonLinter]): + self._linters = linters + self._tree: Tree | None = None + + def lint(self, code: str) -> Iterable[Advice]: + try: + tree = Tree.normalize_and_parse(code) + if self._tree is None: + self._tree = tree + else: + tree = self._tree.append_statements(tree) + for linter in self._linters: + yield from linter.lint_tree(tree) + except AstroidSyntaxError as e: + yield Failure('syntax-error', str(e), 0, 0, 0, 0) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 6a66138446..61e5b151f6 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -1265,7 +1265,7 @@ "code": "dbfs-usage", "message": "Deprecated file system path: dbfs:/" }, - { + { "code": "table-migrate", "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" } @@ -2572,6 +2572,14 @@ "dockerpycreds.utils": [], "dockerpycreds.version": [] }, + "docstring-to-markdown": { + "docstring_to_markdown": [], + "docstring_to_markdown._utils": [], + "docstring_to_markdown.cpython": [], + "docstring_to_markdown.google": [], + "docstring_to_markdown.plain": [], + "docstring_to_markdown.rst": [] + }, "entrypoints": { "entrypoints": [] }, @@ -21782,6 +21790,53 @@ "python-dateutil": { "dateutil": [] }, + "python-lsp-jsonrpc": { + "pylsp_jsonrpc": [], + "pylsp_jsonrpc._version": [], + "pylsp_jsonrpc.dispatchers": [], + "pylsp_jsonrpc.endpoint": [], + "pylsp_jsonrpc.exceptions": [], + "pylsp_jsonrpc.streams": [] + }, + "python-lsp-server": { + "pylsp": [], + "pylsp._utils": [], + "pylsp._version": [], + "pylsp.config": [], + "pylsp.config.config": [], + "pylsp.config.flake8_conf": [], + "pylsp.config.pycodestyle_conf": [], + "pylsp.config.source": [], + "pylsp.hookspecs": [], + "pylsp.lsp": [], + "pylsp.plugins": [], + "pylsp.plugins._resolvers": [], + "pylsp.plugins._rope_task_handle": [], + "pylsp.plugins.autopep8_format": [], + "pylsp.plugins.definition": [], + "pylsp.plugins.flake8_lint": [], + "pylsp.plugins.folding": [], + "pylsp.plugins.highlight": [], + "pylsp.plugins.hover": [], + "pylsp.plugins.jedi_completion": [], + "pylsp.plugins.jedi_rename": [], + "pylsp.plugins.mccabe_lint": [], + "pylsp.plugins.preload_imports": [], + "pylsp.plugins.pycodestyle_lint": [], + "pylsp.plugins.pydocstyle_lint": [], + "pylsp.plugins.pyflakes_lint": [], + "pylsp.plugins.pylint_lint": [], + "pylsp.plugins.references": [], + "pylsp.plugins.rope_autoimport": [], + "pylsp.plugins.rope_completion": [], + "pylsp.plugins.signature": [], + "pylsp.plugins.symbols": [], + "pylsp.plugins.yapf_format": [], + "pylsp.python_lsp": [], + "pylsp.text_edit": [], + "pylsp.uris": [], + "pylsp.workspace": [] + }, "pytz": { "pytz": [] }, @@ -25156,6 +25211,7 @@ "tzdata": { "tzdata": [] }, + "ujson": {}, "umap": { "umap": [], "umap.get": [] @@ -25957,5 +26013,4 @@ "zipp.compat.py310": [], "zipp.glob": [] } -} - +} \ No newline at end of file diff --git a/src/databricks/labs/ucx/source_code/linters/context.py b/src/databricks/labs/ucx/source_code/linters/context.py index d36776e46e..fad60107f9 100644 --- a/src/databricks/labs/ucx/source_code/linters/context.py +++ b/src/databricks/labs/ucx/source_code/linters/context.py @@ -1,7 +1,16 @@ +from typing import cast + from 
databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Fixer, Linter, SequentialLinter, CurrentSessionState +from databricks.labs.ucx.source_code.base import ( + Fixer, + Linter, + SequentialLinter, + CurrentSessionState, + PythonSequentialLinter, + PythonLinter, +) from databricks.labs.ucx.source_code.linters.dbfs import FromDbfsFolder, DBFSUsageLinter from databricks.labs.ucx.source_code.linters.imports import DbutilsLinter @@ -16,7 +25,7 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe self._index = index session_state = CurrentSessionState() if not session_state else session_state - python_linters: list[Linter] = [] + python_linters: list[PythonLinter] = [] python_fixers: list[Fixer] = [] sql_linters: list[Linter] = [] @@ -38,9 +47,9 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe ] sql_linters.append(FromDbfsFolder()) - self._linters = { - Language.PYTHON: SequentialLinter(python_linters), - Language.SQL: SequentialLinter(sql_linters), + self._linters: dict[Language, list[Linter] | list[PythonLinter]] = { + Language.PYTHON: python_linters, + Language.SQL: sql_linters, } self._fixers: dict[Language, list[Fixer]] = { Language.PYTHON: python_fixers, @@ -53,7 +62,9 @@ def is_supported(self, language: Language) -> bool: def linter(self, language: Language) -> Linter: if language not in self._linters: raise ValueError(f"Unsupported language: {language}") - return self._linters[language] + if language is Language.PYTHON: + return PythonSequentialLinter(cast(list[PythonLinter], self._linters[language])) + return SequentialLinter(cast(list[Linter], self._linters[language])) def fixer(self, language: Language, diagnostic_code: str) -> Fixer | None: if language not in self._fixers: diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index b33c25698a..e5c7989793 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -5,8 +5,9 @@ from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError from sqlglot.expressions import Table -from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState, Failure -from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor, InferredValue +from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState, Failure, PythonLinter +from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue logger = logging.getLogger(__name__) @@ -29,7 +30,7 @@ def visit_call(self, node: Call): def _visit_arg(self, arg: NodeNG): try: - for inferred in Tree(arg).infer_values(self._session_state): + for inferred in InferredValue.infer_from_node(arg, self._session_state): if not inferred.is_inferred(): logger.debug(f"Could not infer value of {arg.as_string()}") continue @@ -64,7 +65,7 @@ def get_advices(self) -> Iterable[Advice]: yield from self._advices -class DBFSUsageLinter(Linter): +class DBFSUsageLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState): self._session_state = session_state @@ -76,11 +77,10 @@ def name() -> str: """ return 'dbfs-usage' - def lint(self, code: str) -> Iterable[Advice]: + def lint_tree(self, tree: Tree) 
-> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - tree = Tree.normalize_and_parse(code) visitor = DetectDbfsVisitor(self._session_state) visitor.visit(tree.node) yield from visitor.get_advices() diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index 5e408e2176..1ef187fad7 100644 --- a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -16,8 +16,9 @@ NodeNG, ) -from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory, CurrentSessionState -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor, InferredValue +from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter +from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue logger = logging.getLogger(__name__) @@ -90,7 +91,7 @@ def get_notebook_paths(self, session_state: CurrentSessionState) -> tuple[bool, """ arg = DbutilsLinter.get_dbutils_notebook_run_path_arg(self.node) try: - all_inferred = Tree(arg).infer_values(session_state) + all_inferred = InferredValue.infer_from_node(arg, session_state) return self._get_notebook_paths(all_inferred) except InferenceError: logger.debug(f"Can't infer value(s) of {arg.as_string()}") @@ -110,13 +111,12 @@ def _get_notebook_paths(cls, all_inferred: Iterable[InferredValue]) -> tuple[boo return has_unresolved, paths -class DbutilsLinter(Linter): +class DbutilsLinter(PythonLinter): def __init__(self, session_state: CurrentSessionState): self._session_state = session_state - def lint(self, code: str) -> Iterable[Advice]: - tree = Tree.normalize_and_parse(code) + def lint_tree(self, tree: Tree) -> Iterable[Advice]: nodes = self.list_dbutils_notebook_run_calls(tree) for node in nodes: yield from self._raise_advice_if_unresolved(node.node, self._session_state) @@ -229,7 +229,7 @@ def visit_call(self, node: Call): relative = True changed = changed.args[0] try: - for inferred in Tree(changed).infer_values(self._session_state): + for inferred in InferredValue.infer_from_node(changed, self._session_state): self._visit_inferred(changed, inferred, relative, is_append) except InferenceError: self.sys_path_changes.append(UnresolvedPath(changed, changed.as_string(), is_append)) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index d78267ebcb..b56bdc544d 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -2,19 +2,19 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass -from astroid import Attribute, Call, Const, InferenceError, NodeNG, AstroidSyntaxError # type: ignore +from astroid import Attribute, Call, Const, InferenceError, NodeNG # type: ignore from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.source_code.base import ( Advice, Advisory, Deprecation, Fixer, - Linter, - Failure, CurrentSessionState, + PythonLinter, ) +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue from databricks.labs.ucx.source_code.queries import FromTable -from databricks.labs.ucx.source_code.linters.python_ast import Tree, InferredValue +from 
databricks.labs.ucx.source_code.linters.python_ast import Tree @dataclass @@ -78,7 +78,7 @@ def lint( table_arg = self._get_table_arg(node) if table_arg: try: - for inferred in Tree(table_arg).infer_values(self.session_state): + for inferred in InferredValue.infer_from_node(table_arg, self.session_state): yield from self._lint_table_arg(from_table, node, inferred) except InferenceError: yield Advisory.from_node( @@ -114,7 +114,7 @@ def lint( ) -> Iterator[Advice]: table_arg = self._get_table_arg(node) table_name = table_arg.as_string().strip("'").strip('"') - for inferred in Tree(table_arg).infer_values(session_state): + for inferred in InferredValue.infer_from_node(table_arg, session_state): if not inferred.is_inferred(): yield Advisory.from_node( code='table-migrate-cannot-compute-value', @@ -315,7 +315,7 @@ def matchers(self): return self._matchers -class SparkSql(Linter, Fixer): +class SparkSql(PythonLinter, Fixer): _spark_matchers = SparkMatchers() @@ -328,12 +328,7 @@ def name(self) -> str: # this is the same fixer, just in a different language context return self._from_table.name() - def lint(self, code: str) -> Iterable[Advice]: - try: - tree = Tree.normalize_and_parse(code) - except AstroidSyntaxError as e: - yield Failure('syntax-error', str(e), 0, 0, 0, 0) - return + def lint_tree(self, tree: Tree) -> Iterable[Advice]: for node in tree.walk(): matcher = self._find_matcher(node) if matcher is None: diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 3a9ca4553f..97959c3687 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -3,15 +3,21 @@ from abc import ABC import logging import re -from collections.abc import Iterable, Iterator, Generator -from typing import Any, TypeVar - -from astroid import Assign, Attribute, Call, Const, decorators, Dict, FormattedValue, Import, ImportFrom, JoinedStr, Module, Name, NodeNG, parse, Uninferable # type: ignore -from astroid.context import InferenceContext, InferenceResult, CallContext # type: ignore -from astroid.typing import InferenceErrorInfo # type: ignore -from astroid.exceptions import InferenceError # type: ignore - -from databricks.labs.ucx.source_code.base import CurrentSessionState +from collections.abc import Iterable +from typing import TypeVar, cast + +from astroid import ( # type: ignore + Assign, + Attribute, + Call, + Const, + Import, + ImportFrom, + Module, + Name, + NodeNG, + parse, +) logger = logging.getLogger(__name__) @@ -192,191 +198,20 @@ def _get_attribute_value(cls, node: Attribute): logger.debug(f"Missing handler for {name}") return None - def infer_values(self, state: CurrentSessionState | None = None) -> Iterable[InferredValue]: - self._contextualize(state) - for inferred_atoms in self._infer_values(): - yield InferredValue(inferred_atoms) - - def _contextualize(self, state: CurrentSessionState | None): - if state is None or state.named_parameters is None or len(state.named_parameters) == 0: - return - self._contextualize_dbutils_widgets_get(state) - self._contextualize_dbutils_widgets_get_all(state) - - def _contextualize_dbutils_widgets_get(self, state: CurrentSessionState): - calls = Tree(self.root).locate(Call, [("get", Attribute), ("widgets", Attribute), ("dbutils", Name)]) - for call in calls: - call.func = _DbUtilsWidgetsGetCall(state, call) - - def _contextualize_dbutils_widgets_get_all(self, state: CurrentSessionState): - calls = 
Tree(self.root).locate(Call, [("getAll", Attribute), ("widgets", Attribute), ("dbutils", Name)]) - for call in calls: - call.func = _DbUtilsWidgetsGetAllCall(state, call) - - def _infer_values(self) -> Iterator[Iterable[NodeNG]]: - # deal with node types that don't implement 'inferred()' - if self._node is Uninferable or isinstance(self._node, Const): - yield [self._node] - elif isinstance(self._node, JoinedStr): - yield from self._infer_values_from_joined_string() - elif isinstance(self._node, FormattedValue): - yield from _LocalTree(self._node.value).do_infer_values() - else: - yield from self._infer_internal() - - def _infer_internal(self): - try: - for inferred in self._node.inferred(): - # work around infinite recursion of empty lists - if inferred == self._node: - continue - yield from _LocalTree(inferred).do_infer_values() - except InferenceError as e: - logger.debug(f"When inferring {self._node}", exc_info=e) - yield [Uninferable] - - def _infer_values_from_joined_string(self) -> Iterator[Iterable[NodeNG]]: - assert isinstance(self._node, JoinedStr) - yield from self._infer_values_from_joined_values(self._node.values) - - @classmethod - def _infer_values_from_joined_values(cls, nodes: list[NodeNG]) -> Iterator[Iterable[NodeNG]]: - if len(nodes) == 1: - yield from _LocalTree(nodes[0]).do_infer_values() - return - for firsts in _LocalTree(nodes[0]).do_infer_values(): - for remains in cls._infer_values_from_joined_values(nodes[1:]): - yield list(firsts) + list(remains) - - -class _LocalTree(Tree): - """class that avoids pylint W0212 protected-access warning""" - - def do_infer_values(self): - return self._infer_values() - - -class _DbUtilsWidgetsGetCall(NodeNG): - - def __init__(self, session_state: CurrentSessionState, node: NodeNG): - super().__init__( - lineno=node.lineno, - col_offset=node.col_offset, - end_lineno=node.end_lineno, - end_col_offset=node.end_col_offset, - parent=node.parent, - ) - self._session_state = session_state - - @decorators.raise_if_nothing_inferred - def _infer( - self, context: InferenceContext | None = None, **kwargs: Any - ) -> Generator[InferenceResult, None, InferenceErrorInfo | None]: - yield self - return InferenceErrorInfo(node=self, context=context) - - def infer_call_result(self, context: InferenceContext | None = None, **_): # caller needs unused kwargs - call_context = getattr(context, "callcontext", None) - if not isinstance(call_context, CallContext): - yield Uninferable - return - arg = call_context.args[0] - for inferred in Tree(arg).infer_values(self._session_state): - if not inferred.is_inferred(): - yield Uninferable - continue - name = inferred.as_string() - named_parameters = self._session_state.named_parameters - if not named_parameters or name not in named_parameters: - yield Uninferable - continue - value = named_parameters[name] - yield Const( - value, - lineno=self.lineno, - col_offset=self.col_offset, - end_lineno=self.end_lineno, - end_col_offset=self.end_col_offset, - parent=self, - ) - - -class _DbUtilsWidgetsGetAllCall(NodeNG): - - def __init__(self, session_state: CurrentSessionState, node: NodeNG): - super().__init__( - lineno=node.lineno, - col_offset=node.col_offset, - end_lineno=node.end_lineno, - end_col_offset=node.end_col_offset, - parent=node.parent, - ) - self._session_state = session_state - - @decorators.raise_if_nothing_inferred - def _infer( - self, context: InferenceContext | None = None, **kwargs: Any - ) -> Generator[InferenceResult, None, InferenceErrorInfo | None]: - yield self - return 
InferenceErrorInfo(node=self, context=context) - - def infer_call_result(self, **_): # caller needs unused kwargs - named_parameters = self._session_state.named_parameters - if not named_parameters: - yield Uninferable - return - items = self._populate_items(named_parameters) - result = Dict( - lineno=self.lineno, - col_offset=self.col_offset, - end_lineno=self.end_lineno, - end_col_offset=self.end_col_offset, - parent=self, - ) - result.postinit(items) - yield result - - def _populate_items(self, values: dict[str, str]): - items: list[tuple[InferenceResult, InferenceResult]] = [] - for key, value in values.items(): - item_key = Const( - key, - lineno=self.lineno, - col_offset=self.col_offset, - end_lineno=self.end_lineno, - end_col_offset=self.end_col_offset, - parent=self, - ) - item_value = Const( - value, - lineno=self.lineno, - col_offset=self.col_offset, - end_lineno=self.end_lineno, - end_col_offset=self.end_col_offset, - parent=self, - ) - items.append((item_key, item_value)) - return items - - -class InferredValue: - """Represents 1 or more nodes that together represent the value. - The list of nodes typically holds one Const element, but for f-strings it - can hold multiple ones, including Uninferable nodes.""" - - def __init__(self, atoms: Iterable[NodeNG]): - self._atoms = list(atoms) - - @property - def nodes(self): - return self._atoms - - def is_inferred(self): - return all(atom is not Uninferable for atom in self._atoms) - - def as_string(self): - strings = [str(const.value) for const in filter(lambda atom: isinstance(atom, Const), self._atoms)] - return "".join(strings) + def append_statements(self, tree: Tree) -> Tree: + if not isinstance(tree.node, Module): + raise NotImplementedError(f"Can't append statements from {type(tree.node).__name__}") + tree_module: Module = cast(Module, tree.node) + if not isinstance(self.node, Module): + raise NotImplementedError(f"Can't append statements to {type(self.node).__name__}") + self_module: Module = cast(Module, self.node) + for stmt in tree_module.body: + stmt.parent = self_module + self_module.body.append(stmt) + for name, value in tree_module.globals.items(): + self_module.globals[name] = value + # the following may seem strange but it's actually ok to use the original module as tree root + return tree class TreeVisitor: diff --git a/src/databricks/labs/ucx/source_code/linters/python_infer.py b/src/databricks/labs/ucx/source_code/linters/python_infer.py new file mode 100644 index 0000000000..073ab362a6 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/linters/python_infer.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +import logging +from collections.abc import Iterable, Iterator, Generator +from typing import Any + +from astroid import ( # type: ignore + Attribute, + Call, + Const, + decorators, + Dict, + FormattedValue, + JoinedStr, + Name, + NodeNG, + Uninferable, +) +from astroid.context import InferenceContext, InferenceResult, CallContext # type: ignore +from astroid.typing import InferenceErrorInfo # type: ignore +from astroid.exceptions import InferenceError # type: ignore + +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.linters.python_ast import Tree + +logger = logging.getLogger(__name__) + + +class InferredValue: + """Represents 1 or more nodes that together represent the value. 
+ The list of nodes typically holds one Const element, but for f-strings it + can hold multiple ones, including Uninferable nodes.""" + + @classmethod + def infer_from_node(cls, node: NodeNG, state: CurrentSessionState | None = None) -> Iterable[InferredValue]: + cls._contextualize(node, state) + for inferred_atoms in cls._infer_values(node): + yield InferredValue(inferred_atoms) + + @classmethod + def _contextualize(cls, node: NodeNG, state: CurrentSessionState | None): + if state is None or state.named_parameters is None or len(state.named_parameters) == 0: + return + cls._contextualize_dbutils_widgets_get(node, state) + cls._contextualize_dbutils_widgets_get_all(node, state) + + @classmethod + def _contextualize_dbutils_widgets_get(cls, node: NodeNG, state: CurrentSessionState): + root = Tree(node).root + calls = Tree(root).locate(Call, [("get", Attribute), ("widgets", Attribute), ("dbutils", Name)]) + for call in calls: + call.func = _DbUtilsWidgetsGetCall(state, call) + + @classmethod + def _contextualize_dbutils_widgets_get_all(cls, node: NodeNG, state: CurrentSessionState): + root = Tree(node).root + calls = Tree(root).locate(Call, [("getAll", Attribute), ("widgets", Attribute), ("dbutils", Name)]) + for call in calls: + call.func = _DbUtilsWidgetsGetAllCall(state, call) + + @classmethod + def _infer_values(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: + # deal with node types that don't implement 'inferred()' + if node is Uninferable or isinstance(node, Const): + yield [node] + elif isinstance(node, JoinedStr): + yield from cls._infer_values_from_joined_string(node) + elif isinstance(node, FormattedValue): + yield from _LocalInferredValue.do_infer_values(node.value) + else: + yield from cls._infer_internal(node) + + @classmethod + def _infer_internal(cls, node: NodeNG): + try: + for inferred in node.inferred(): + # work around infinite recursion of empty lists + if inferred == node: + continue + yield from _LocalInferredValue.do_infer_values(inferred) + except InferenceError as e: + logger.debug(f"When inferring {node}", exc_info=e) + yield [Uninferable] + + @classmethod + def _infer_values_from_joined_string(cls, node: NodeNG) -> Iterator[Iterable[NodeNG]]: + assert isinstance(node, JoinedStr) + yield from cls._infer_values_from_joined_values(node.values) + + @classmethod + def _infer_values_from_joined_values(cls, nodes: list[NodeNG]) -> Iterator[Iterable[NodeNG]]: + if len(nodes) == 1: + yield from _LocalInferredValue.do_infer_values(nodes[0]) + return + for firsts in _LocalInferredValue.do_infer_values(nodes[0]): + for remains in cls._infer_values_from_joined_values(nodes[1:]): + yield list(firsts) + list(remains) + + def __init__(self, atoms: Iterable[NodeNG]): + self._atoms = list(atoms) + + @property + def nodes(self): + return self._atoms + + def is_inferred(self): + return all(atom is not Uninferable for atom in self._atoms) + + def as_string(self): + strings = [str(const.value) for const in filter(lambda atom: isinstance(atom, Const), self._atoms)] + return "".join(strings) + + +class _DbUtilsWidgetsGetCall(NodeNG): + + def __init__(self, session_state: CurrentSessionState, node: NodeNG): + super().__init__( + lineno=node.lineno, + col_offset=node.col_offset, + end_lineno=node.end_lineno, + end_col_offset=node.end_col_offset, + parent=node.parent, + ) + self._session_state = session_state + + @decorators.raise_if_nothing_inferred + def _infer( + self, context: InferenceContext | None = None, **kwargs: Any + ) -> Generator[InferenceResult, None, InferenceErrorInfo | 
None]: + yield self + return InferenceErrorInfo(node=self, context=context) + + def infer_call_result(self, context: InferenceContext | None = None, **_): # caller needs unused kwargs + call_context = getattr(context, "callcontext", None) + if not isinstance(call_context, CallContext): + yield Uninferable + return + arg = call_context.args[0] + for inferred in InferredValue.infer_from_node(arg, self._session_state): + if not inferred.is_inferred(): + yield Uninferable + continue + name = inferred.as_string() + named_parameters = self._session_state.named_parameters + if not named_parameters or name not in named_parameters: + yield Uninferable + continue + value = named_parameters[name] + yield Const( + value, + lineno=self.lineno, + col_offset=self.col_offset, + end_lineno=self.end_lineno, + end_col_offset=self.end_col_offset, + parent=self, + ) + + +class _LocalInferredValue(InferredValue): + + @classmethod + def do_infer_values(cls, node: NodeNG): + yield from cls._infer_values(node) + + +class _DbUtilsWidgetsGetAllCall(NodeNG): + + def __init__(self, session_state: CurrentSessionState, node: NodeNG): + super().__init__( + lineno=node.lineno, + col_offset=node.col_offset, + end_lineno=node.end_lineno, + end_col_offset=node.end_col_offset, + parent=node.parent, + ) + self._session_state = session_state + + @decorators.raise_if_nothing_inferred + def _infer( + self, context: InferenceContext | None = None, **kwargs: Any + ) -> Generator[InferenceResult, None, InferenceErrorInfo | None]: + yield self + return InferenceErrorInfo(node=self, context=context) + + def infer_call_result(self, **_): # caller needs unused kwargs + named_parameters = self._session_state.named_parameters + if not named_parameters: + yield Uninferable + return + items = self._populate_items(named_parameters) + result = Dict( + lineno=self.lineno, + col_offset=self.col_offset, + end_lineno=self.end_lineno, + end_col_offset=self.end_col_offset, + parent=self, + ) + result.postinit(items) + yield result + + def _populate_items(self, values: dict[str, str]): + items: list[tuple[InferenceResult, InferenceResult]] = [] + for key, value in values.items(): + item_key = Const( + key, + lineno=self.lineno, + col_offset=self.col_offset, + end_lineno=self.end_lineno, + end_col_offset=self.end_col_offset, + parent=self, + ) + item_value = Const( + value, + lineno=self.lineno, + col_offset=self.col_offset, + end_lineno=self.end_lineno, + end_col_offset=self.end_col_offset, + parent=self, + ) + items.append((item_key, item_value)) + return items diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index 6b3b2f0d28..962057d2c8 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -6,7 +6,7 @@ from databricks.labs.ucx.source_code.base import ( Advice, Failure, - Linter, + PythonLinter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree @@ -172,7 +172,7 @@ def _match_jvm_log(self, node: NodeNG) -> Iterator[Advice]: ) -class SparkConnectLinter(Linter): +class SparkConnectLinter(PythonLinter): def __init__(self, is_serverless: bool = False): self._matchers = [ JvmAccessMatcher(is_serverless=is_serverless), @@ -181,7 +181,6 @@ def __init__(self, is_serverless: bool = False): LoggingMatcher(is_serverless=is_serverless), ] - def lint(self, code: str) -> Iterator[Advice]: - tree = Tree.normalize_and_parse(code) + def lint_tree(self, tree: Tree) -> 
Iterator[Advice]: for matcher in self._matchers: yield from matcher.lint_tree(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 5d2b27044a..147c72e30c 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -7,7 +7,7 @@ from databricks.labs.ucx.source_code.base import ( Advice, - Linter, + PythonLinter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree @@ -92,7 +92,7 @@ def lint(self, node: NodeNG) -> Iterator[Advice]: ) -class DBRv8d0Linter(Linter): +class DBRv8d0Linter(PythonLinter): """Performs Python linting for backwards incompatible changes in DBR version 8.0. Specifically, it yields advice for table-creation with implicit format. """ @@ -111,9 +111,8 @@ def __init__(self, dbr_version: tuple[int, int] | None): ] ) - def lint(self, code: str) -> Iterable[Advice]: + def lint_tree(self, tree: Tree) -> Iterable[Advice]: if self._skip_dbr: return - tree = Tree.normalize_and_parse(code) for node in tree.walk(): yield from self._linter.lint(node) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 42177876af..1da4afaf73 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -9,7 +9,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Advice, Failure +from databricks.labs.ucx.source_code.base import Advice, Failure, Linter from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem from databricks.labs.ucx.source_code.linters.context import LinterContext @@ -87,6 +87,8 @@ class NotebookLinter: def __init__(self, langs: LinterContext, notebook: Notebook): self._languages: LinterContext = langs self._notebook: Notebook = notebook + # reuse Python linter, which accumulates statements for improved inference + self._python_linter = langs.linter(Language.PYTHON) @classmethod def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter': @@ -99,13 +101,18 @@ def lint(self) -> Iterable[Advice]: for cell in self._notebook.cells: if not self._languages.is_supported(cell.language.language): continue - linter = self._languages.linter(cell.language.language) + linter = self._linter(cell.language.language) for advice in linter.lint(cell.original_code): yield advice.replace( start_line=advice.start_line + cell.original_offset, end_line=advice.end_line + cell.original_offset, ) + def _linter(self, language: Language) -> Linter: + if language is Language.PYTHON: + return self._python_linter + return self._languages.linter(language) + @staticmethod def name() -> str: return "notebook-linter" diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index 51e0f7ed78..47569c7321 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -1,8 +1,8 @@ import pytest from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr # type: ignore -from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.python_ast import Tree +from 
databricks.labs.ucx.source_code.linters.python_infer import InferredValue def test_extracts_root(): @@ -99,163 +99,6 @@ def test_tree_walks_nodes_once(): assert len(nodes) == count -def test_infers_empty_list(): - tree = Tree.parse("a=[]") - nodes = tree.locate(Assign, []) - tree = Tree(nodes[0].value) - values = list(tree.infer_values()) - assert not values - - -def test_infers_empty_tuple(): - tree = Tree.parse("a=tuple()") - nodes = tree.locate(Assign, []) - tree = Tree(nodes[0].value) - values = list(tree.infer_values()) - assert not values - - -def test_infers_empty_set(): - tree = Tree.parse("a={}") - nodes = tree.locate(Assign, []) - tree = Tree(nodes[0].value) - values = list(tree.infer_values()) - assert not values - - -def test_infers_fstring_value(): - source = """ -value = "abc" -fstring = f"Hello {value}!" -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[1].value) # value of fstring = ... - values = list(tree.infer_values()) - assert all(value.is_inferred() for value in values) - strings = list(value.as_string() for value in values) - assert strings == ["Hello abc!"] - - -def test_infers_string_format_value(): - source = """ -value = "abc" -fstring = "Hello {0}!".format(value) -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[1].value) # value of fstring = ... - values = list(tree.infer_values()) - assert all(value.is_inferred() for value in values) - strings = list(value.as_string() for value in values) - assert strings == ["Hello abc!"] - - -def test_infers_fstring_values(): - source = """ -values_1 = ["abc", "def"] -for value1 in values_1: - values_2 = ["ghi", "jkl"] - for value2 in values_2: - fstring = f"Hello {value1}, {value2}!" -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[2].value) # value of fstring = ... - values = list(tree.infer_values()) - assert all(value.is_inferred() for value in values) - strings = list(value.as_string() for value in values) - assert strings == ["Hello abc, ghi!", "Hello abc, jkl!", "Hello def, ghi!", "Hello def, jkl!"] - - -def test_fails_to_infer_cascading_fstring_values(): - # The purpose of this test is to detect a change in astroid support for f-strings - source = """ -value1 = "John" -value2 = f"Hello {value1}" -value3 = f"{value2}, how are you today?" -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[2].value) # value of value3 = ... - values = list(tree.infer_values()) - # for now, we simply check failure to infer! - assert any(not value.is_inferred() for value in values) - # the expected value would be ["Hello John, how are you today?"] - - -def test_infers_externally_defined_value(): - state = CurrentSessionState() - state.named_parameters = {"my-widget": "my-value"} - source = """ -name = "my-widget" -value = dbutils.widgets.get(name) -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[1].value) # value of value = ... - values = list(tree.infer_values(state)) - strings = list(value.as_string() for value in values) - assert strings == ["my-value"] - - -def test_infers_externally_defined_values(): - state = CurrentSessionState() - state.named_parameters = {"my-widget-1": "my-value-1", "my-widget-2": "my-value-2"} - source = """ -for name in ["my-widget-1", "my-widget-2"]: - value = dbutils.widgets.get(name) -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[0].value) # value of value = ... 
- values = list(tree.infer_values(state)) - strings = list(value.as_string() for value in values) - assert strings == ["my-value-1", "my-value-2"] - - -def test_fails_to_infer_missing_externally_defined_value(): - state = CurrentSessionState() - state.named_parameters = {"my-widget-1": "my-value-1", "my-widget-2": "my-value-2"} - source = """ -name = "my-widget" -value = dbutils.widgets.get(name) -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[1].value) # value of value = ... - values = tree.infer_values(state) - assert all(not value.is_inferred() for value in values) - - -def test_survives_absence_of_externally_defined_values(): - source = """ - name = "my-widget" - value = dbutils.widgets.get(name) - """ - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[1].value) # value of value = ... - values = tree.infer_values(CurrentSessionState()) - assert all(not value.is_inferred() for value in values) - - -def test_infers_externally_defined_value_set(): - state = CurrentSessionState() - state.named_parameters = {"my-widget": "my-value"} - source = """ -values = dbutils.widgets.getAll() -name = "my-widget" -value = values[name] -""" - tree = Tree.parse(source) - nodes = tree.locate(Assign, []) - tree = Tree(nodes[2].value) # value of value = ... - values = list(tree.infer_values(state)) - strings = list(value.as_string() for value in values) - assert strings == ["my-value"] - - def test_parses_incorrectly_indented_code(): source = """# DBTITLE 1,Get Sales Data for Analysis sales = ( @@ -291,3 +134,16 @@ def test_ignores_magic_marker_in_multiline_comment(): """ Tree.normalize_and_parse(source) assert True + + +def test_appends_statements(): + source_1 = "a = 'John'" + tree_1 = Tree.normalize_and_parse(source_1) + source_2 = 'b = f"Hello {a}!"' + tree_2 = Tree.normalize_and_parse(source_2) + tree_3 = tree_1.append_statements(tree_2) + nodes = tree_3.locate(Assign, []) + tree = Tree(nodes[0].value) # tree_3 only contains tree_2 statements + values = list(InferredValue.infer_from_node(tree.node)) + strings = list(value.as_string() for value in values) + assert strings == ["Hello John!"] diff --git a/tests/unit/source_code/linters/test_python_infer.py b/tests/unit/source_code/linters/test_python_infer.py new file mode 100644 index 0000000000..f838cb7154 --- /dev/null +++ b/tests/unit/source_code/linters/test_python_infer.py @@ -0,0 +1,176 @@ +from astroid import Assign # type: ignore + +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.linters.python_infer import InferredValue + + +def test_infers_empty_list(): + tree = Tree.parse("a=[]") + nodes = tree.locate(Assign, []) + tree = Tree(nodes[0].value) + values = list(InferredValue.infer_from_node(tree.node)) + assert not values + + +def test_infers_empty_tuple(): + tree = Tree.parse("a=tuple()") + nodes = tree.locate(Assign, []) + tree = Tree(nodes[0].value) + values = list(InferredValue.infer_from_node(tree.node)) + assert not values + + +def test_infers_empty_set(): + tree = Tree.parse("a={}") + nodes = tree.locate(Assign, []) + tree = Tree(nodes[0].value) + values = list(InferredValue.infer_from_node(tree.node)) + assert not values + + +def test_infers_fstring_value(): + source = """ +value = "abc" +fstring = f"Hello {value}!" +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of fstring = ... 
+ values = list(InferredValue.infer_from_node(tree.node)) + assert all(value.is_inferred() for value in values) + strings = list(value.as_string() for value in values) + assert strings == ["Hello abc!"] + + +def test_infers_fstring_dict_value(): + source = """ +value = { "abc": 123 } +fstring = f"Hello {value['abc']}!" +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of fstring = ... + values = list(InferredValue.infer_from_node(tree.node)) + assert all(value.is_inferred() for value in values) + strings = list(value.as_string() for value in values) + assert strings == ["Hello 123!"] + + +def test_infers_string_format_value(): + source = """ +value = "abc" +fstring = "Hello {0}!".format(value) +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of fstring = ... + values = list(InferredValue.infer_from_node(tree.node)) + assert all(value.is_inferred() for value in values) + strings = list(value.as_string() for value in values) + assert strings == ["Hello abc!"] + + +def test_infers_fstring_values(): + source = """ +values_1 = ["abc", "def"] +for value1 in values_1: + values_2 = ["ghi", "jkl"] + for value2 in values_2: + fstring = f"Hello {value1}, {value2}!" +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[2].value) # value of fstring = ... + values = list(InferredValue.infer_from_node(tree.node)) + assert all(value.is_inferred() for value in values) + strings = list(value.as_string() for value in values) + assert strings == ["Hello abc, ghi!", "Hello abc, jkl!", "Hello def, ghi!", "Hello def, jkl!"] + + +def test_fails_to_infer_cascading_fstring_values(): + # The purpose of this test is to detect a change in astroid support for f-strings + source = """ +value1 = "John" +value2 = f"Hello {value1}" +value3 = f"{value2}, how are you today?" +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[2].value) # value of value3 = ... + values = list(InferredValue.infer_from_node(tree.node)) + # for now, we simply check failure to infer! + assert any(not value.is_inferred() for value in values) + # the expected value would be ["Hello John, how are you today?"] + + +def test_infers_externally_defined_value(): + state = CurrentSessionState() + state.named_parameters = {"my-widget": "my-value"} + source = """ +name = "my-widget" +value = dbutils.widgets.get(name) +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of value = ... + values = list(InferredValue.infer_from_node(tree.node, state)) + strings = list(value.as_string() for value in values) + assert strings == ["my-value"] + + +def test_infers_externally_defined_values(): + state = CurrentSessionState() + state.named_parameters = {"my-widget-1": "my-value-1", "my-widget-2": "my-value-2"} + source = """ +for name in ["my-widget-1", "my-widget-2"]: + value = dbutils.widgets.get(name) +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[0].value) # value of value = ... 
+ values = list(InferredValue.infer_from_node(tree.node, state)) + strings = list(value.as_string() for value in values) + assert strings == ["my-value-1", "my-value-2"] + + +def test_fails_to_infer_missing_externally_defined_value(): + state = CurrentSessionState() + state.named_parameters = {"my-widget-1": "my-value-1", "my-widget-2": "my-value-2"} + source = """ +name = "my-widget" +value = dbutils.widgets.get(name) +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of value = ... + values = InferredValue.infer_from_node(tree.node, state) + assert all(not value.is_inferred() for value in values) + + +def test_survives_absence_of_externally_defined_values(): + source = """ + name = "my-widget" + value = dbutils.widgets.get(name) + """ + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of value = ... + values = InferredValue.infer_from_node(tree.node, CurrentSessionState()) + assert all(not value.is_inferred() for value in values) + + +def test_infers_externally_defined_value_set(): + state = CurrentSessionState() + state.named_parameters = {"my-widget": "my-value"} + source = """ +values = dbutils.widgets.getAll() +name = "my-widget" +value = values[name] +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[2].value) # value of value = ... + values = list(InferredValue.infer_from_node(tree.node, state)) + strings = list(value.as_string() for value in values) + assert strings == ["my-value"] diff --git a/tests/unit/source_code/samples/values_across_cells.py b/tests/unit/source_code/samples/values_across_cells.py new file mode 100644 index 0000000000..fe7366eeb5 --- /dev/null +++ b/tests/unit/source_code/samples/values_across_cells.py @@ -0,0 +1,6 @@ +# Databricks notebook source +a = 12 + +# COMMAND ---------- + +spark.table(f"{a}") diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index d507f03b38..18b4956cc6 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from databricks.sdk.service.workspace import Language @@ -547,3 +549,21 @@ def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected) assert linter is not None advices = list(linter.lint()) assert advices == expected + + +def test_computes_values_across_cells(extended_test_index, mock_path_lookup): + path = mock_path_lookup.resolve(Path("values_across_cells.py")) + source = path.read_text() + linter = NotebookLinter.from_source(extended_test_index, source, Language.PYTHON) + advices = list(linter.lint()) + expected = [ + Advice( + code='table-migrate', + message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', + start_line=5, + start_col=0, + end_line=5, + end_col=19, + ) + ] + assert advices == expected From c5ed423c0f208070e932bcecbe7a7465a67ba2cb Mon Sep 17 00:00:00 2001 From: qziyuan <91635877+qziyuan@users.noreply.github.com> Date: Fri, 5 Jul 2024 00:40:25 -0700 Subject: [PATCH 12/22] Do not migrate READ_METADATA to BROWSE on tables and schemas (#2022) ## Changes UC only support BROWSE privilege on catalog object. Translate legacy hive_metastore privilege READ_METADATA on tables and databases to BROWSE privilege on UC tables and schemas will fail and cause error messages in the migrate tables workflow logs, such error messages will confuse the users. 
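For reference, the practical effect of the mapping change below is that `uc_grant_sql()` no longer produces a `GRANT BROWSE` statement for `READ_METADATA` on tables, views and databases; it yields no UC statement for such grants, so the migration can skip them instead of emitting SQL that fails. A minimal sketch of that behaviour, assuming the `Grant` constructor arguments shown in the updated unit test and that `uc_grant_sql()` may be called without an explicit object type and key (the skip-handling itself is illustrative, not the actual workflow code):

```python
from databricks.labs.ucx.hive_metastore.grants import Grant

# Legacy READ_METADATA on a Hive table no longer maps to a UC privilege,
# so no GRANT BROWSE statement is generated for it.
grant = Grant("user", "READ_METADATA", catalog="hive_metastore", database="mydb", table="mytable")
sql = grant.uc_grant_sql()  # assumption: object type and key are derived from the grant itself
if sql is None:
    # illustrative handling: skip unmappable grants rather than failing the migrate-tables workflow
    print("READ_METADATA is not migrated to BROWSE; skipping this grant")
else:
    print(sql)  # privileges that do map, e.g. SELECT or MODIFY, still produce a GRANT statement
```

This is what removes the confusing error messages from the migrate-tables workflow logs: the unsupported translation is never attempted in the first place.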
--- src/databricks/labs/ucx/hive_metastore/grants.py | 3 --- tests/unit/hive_metastore/test_grants.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 9a25c3ef4a..e39d857516 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -167,11 +167,9 @@ def uc_grant_sql(self, object_type: str | None = None, object_key: str | None = ("FUNCTION", "SELECT"): self._uc_action("EXECUTE"), ("TABLE", "SELECT"): self._uc_action("SELECT"), ("TABLE", "MODIFY"): self._uc_action("MODIFY"), - ("TABLE", "READ_METADATA"): self._uc_action("BROWSE"), ("TABLE", "ALL PRIVILEGES"): self._uc_action("ALL PRIVILEGES"), ("TABLE", "OWN"): self._set_owner_sql, ("VIEW", "SELECT"): self._uc_action("SELECT"), - ("VIEW", "READ_METADATA"): self._uc_action("BROWSE"), ("VIEW", "OWN"): self._set_owner_sql, ("DATABASE", "USAGE"): self._uc_action("USE SCHEMA"), ("DATABASE", "CREATE"): self._uc_action("CREATE TABLE"), @@ -179,7 +177,6 @@ def uc_grant_sql(self, object_type: str | None = None, object_key: str | None = ("DATABASE", "SELECT"): self._uc_action("SELECT"), ("DATABASE", "MODIFY"): self._uc_action("MODIFY"), ("DATABASE", "OWN"): self._set_owner_sql, - ("DATABASE", "READ_METADATA"): self._uc_action("BROWSE"), ("CATALOG", "OWN"): self._set_owner_sql, ("CATALOG", "USAGE"): self._uc_action("USE CATALOG"), } diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index d45ed10033..877ffafdcf 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -117,7 +117,7 @@ def test_hive_deny_sql(): [ ( Grant("user", "READ_METADATA", catalog="hive_metastore", database="mydb", table="mytable"), - "GRANT BROWSE ON TABLE hive_metastore.mydb.mytable TO `user`", + None, ), ( Grant("me", "OWN", catalog="hive_metastore", database="mydb", table="mytable"), From a760927d52e4cef41065a4170163a4279e7d0821 Mon Sep 17 00:00:00 2001 From: Cor Date: Fri, 5 Jul 2024 09:41:05 +0200 Subject: [PATCH 13/22] Log the right amount of lint problems (#2024) ## Changes Log the right amount of lint problems, now it is less than the actual number. Incorrect message is introduced in #1956, fixing here. 
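The fix below is a one-line move: `Threads.gather` returns one list of problems per linted job, so taking `len()` before flattening counts jobs rather than problems and understates the total. A small self-contained illustration of the difference (the data is made up; only the `itertools.chain` flattening mirrors the actual code):

```python
import itertools

# One result list per linted job, as returned by the parallel gather
job_problems = [["problem-1", "problem-2"], ["problem-3"]]

print(len(job_problems))  # 2 -- number of jobs, the misleading count that was logged before
job_problems_flattened = list(itertools.chain(*job_problems))
print(len(job_problems_flattened))  # 3 -- number of problems, the count logged after the fix
```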
--- src/databricks/labs/ucx/source_code/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 636d57e608..aa82e9557f 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -298,8 +298,8 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): tasks.append(functools.partial(self.lint_job, job.job_id)) logger.info(f"Running {tasks} linting tasks in parallel...") job_problems, errors = Threads.gather('linting workflows', tasks) - logger.info(f"Saving {len(job_problems)} linting problems...") job_problems_flattened = list(itertools.chain(*job_problems)) + logger.info(f"Saving {len(job_problems_flattened)} linting problems...") sql_backend.save_table( f'{inventory_database}.workflow_problems', job_problems_flattened, From cd6ad77f28b3e3f091ef1ffeded1854af6e07659 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Jul 2024 09:41:22 +0200 Subject: [PATCH 14/22] Update sqlglot requirement from <25.5,>=25.4.1 to >=25.5.0,<25.6 (#2084) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [sqlglot](https://github.com/tobymao/sqlglot) to permit the latest version.
Changelog

Sourced from sqlglot's changelog.

[v25.5.0] - 2024-07-04

:boom: BREAKING CHANGES

:sparkles: New Features

:bug: Bug Fixes

:recycle: Refactors

[v25.4.1] - 2024-06-29

:bug: Bug Fixes

[v25.4.0] - 2024-06-28

:boom: BREAKING CHANGES

... (truncated)

Commits
  • 912bc84 feat(spark, databricks): Support view schema binding options (#3739)
  • c790c3b Fix(tsql): parse rhs of x::varchar(max) into a type (#3737)
  • 820d664 Feat(presto): wrap md5 string arguments in to_utf8 (#3732)
  • fb066a6 fix(oracle)!: Decouple NVL() from COALESCE() (#3734)
  • e5a53aa feat(snowflake): Support for FROM CHANGES (#3731)
  • 1e07c4d feat(presto, trino): Configurable transpilation of Snowflake VARIANT (#3725)
  • 8335ba1 Fix(clickhouse)!: preserve EXTRACT(date_part FROM datetime) calls (#3729)
  • f4a2872 Fix(clickhouse): switch off table alias columns generation (#3727)
  • 37b6e2d Feat(snowflake): add support for VECTOR(type, size) (#3724)
  • 84416d2 Refactor(teradata): clean up CurrentTimestamp generation logic
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2b95c7d04f..82a74805e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = ["databricks-sdk>=0.27,<0.30", "databricks-labs-lsql>=0.4,<0.6", "databricks-labs-blueprint>=0.6.0", "PyYAML>=6.0.0,<7.0.0", - "sqlglot>=25.4.1,<25.5", + "sqlglot>=25.5.0,<25.6", "astroid>=3.2.2"] [project.optional-dependencies] From 0bc144897e0fc81318d72f57af0136cd705712b9 Mon Sep 17 00:00:00 2001 From: qziyuan <91635877+qziyuan@users.noreply.github.com> Date: Fri, 5 Jul 2024 00:42:43 -0700 Subject: [PATCH 15/22] Handle exceptions with no error_code attribute while crawling permissions (#2079) Avoid `AttributeError` error when exceptions without an `error_code` attribute are caught during the assessment job crawling permissions. ### Linked issues Resolves #2078 ### Tests - [ ] manually tested - [ x] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached) --- .../labs/ucx/workspace_access/manager.py | 8 ++++---- tests/unit/workspace_access/test_manager.py | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/workspace_access/manager.py b/src/databricks/labs/ucx/workspace_access/manager.py index e09fc849e3..99b0efa71e 100644 --- a/src/databricks/labs/ucx/workspace_access/manager.py +++ b/src/databricks/labs/ucx/workspace_access/manager.py @@ -32,11 +32,11 @@ def inventorize_permissions(self): items, errors = Threads.gather("crawl permissions", crawler_tasks) acute_errors = [] for error in errors: - if error.error_code not in self.ERRORS_TO_IGNORE: - logger.error(f"Error while crawling permissions: {error}") - acute_errors.append(error) + if hasattr(error, 'error_code') and error.error_code in self.ERRORS_TO_IGNORE: + logger.info(f"Error while crawling permissions: {error}. Skipping") continue - logger.info(f"Error while crawling permissions: {error}. 
Skipping") + logger.error(f"Error while crawling permissions: {error}") + acute_errors.append(error) if len(acute_errors) > 0: raise ManyError(acute_errors) logger.info(f"Total crawled permissions: {len(items)}") diff --git a/tests/unit/workspace_access/test_manager.py b/tests/unit/workspace_access/test_manager.py index 7bd46376f8..2f0cc00b1f 100644 --- a/tests/unit/workspace_access/test_manager.py +++ b/tests/unit/workspace_access/test_manager.py @@ -8,6 +8,7 @@ from databricks.sdk.errors import DatabricksError from databricks.sdk.service import iam +from databricks.labs.blueprint.parallel import ManyError from databricks.labs.ucx.workspace_access.base import AclSupport from databricks.labs.ucx.workspace_access.groups import MigratedGroup, MigrationState from databricks.labs.ucx.workspace_access.manager import PermissionManager, Permissions @@ -107,6 +108,25 @@ def raise_error(): ) +def test_manager_inventorize_fail_with_error(mock_backend, mocker): + def raise_error(): + raise DatabricksError( + "Fail the job", + error_code="NO_SKIP", + ) + + def raise_error_no_code(): + raise TimeoutError + + some_crawler = mocker.Mock() + some_crawler.get_crawler_tasks = lambda: [lambda: Permissions("a", "b", "c"), raise_error, raise_error_no_code] + permission_manager = PermissionManager(mock_backend, "test_database", [some_crawler]) + + with pytest.raises(ManyError) as expected_err: + permission_manager.inventorize_permissions() + assert len(expected_err.value.errs) == 2 + + def test_manager_apply(mocker): sql_backend = MockBackend( rows={ From 7589721b84682f3ac18a04fafadf8a9fa0de6e54 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:45:38 +0200 Subject: [PATCH 16/22] whitelist mmf_sa (#2028) ## Changes whitelist mmf_sa ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 61e5b151f6..1ac0485d38 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -5572,6 +5572,9 @@ "megatron.utils.pipeline": [], "megatron.visuals": [] }, + "mmf_sa": { + "mmf_sa": [] + }, "ml-dtypes": { "ml_dtypes": [], "ml_dtypes._finfo": [], From ca708f1e540193ba9ac83aa9ecf31163b018c60e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:47:58 +0200 Subject: [PATCH 17/22] whitelist h3 (#2030) ## Changes whitelist h3 ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 1ac0485d38..ad8760bdd3 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -3468,6 +3468,29 @@ "gunicorn": { "gunicorn": [] }, + "h3": { + ".........setup": [], + "h3": [], + "h3._cy": [], + "h3._version": [], + "h3.api": [], + "h3.api._api_template": [], + "h3.api.basic_int": [], + "h3.api.basic_int._binding": [], + "h3.api.basic_int._public_api": [], + "h3.api.basic_str": [], + "h3.api.basic_str._binding": [], + "h3.api.basic_str._public_api": [], + "h3.api.memview_int": [], + "h3.api.memview_int._binding": [], + "h3.api.memview_int._public_api": [], + 
"h3.api.numpy_int": [], + "h3.api.numpy_int._binding": [], + "h3.api.numpy_int._public_api": [], + "h3.unstable": [], + "h3.unstable.v4": [], + "h3.unstable.vect": [] + }, "h5py": { "h5py": [] }, From 58de7869686ef97573e502ac84c98f71783c391d Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:48:25 +0200 Subject: [PATCH 18/22] whitelist bioinfokit (#2031) ## Changes whitelist bioinfokit ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known-readme.txt | 0 src/databricks/labs/ucx/source_code/known.json | 6 ++++++ 2 files changed, 6 insertions(+) create mode 100644 src/databricks/labs/ucx/source_code/known-readme.txt diff --git a/src/databricks/labs/ucx/source_code/known-readme.txt b/src/databricks/labs/ucx/source_code/known-readme.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index ad8760bdd3..0886bb3f12 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -960,6 +960,12 @@ "bidict._typing": [], "bidict.metadata": [] }, + "bioinfokit": { + "bioinfokit": [], + "bioinfokit.analys": [], + "bioinfokit.help": [], + "bioinfokit.visuz": [] + }, "black": { "_black_version": [], "black": [], From 0b7df886d7cd9b5cc020a08357ab0bd6881157b2 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:50:47 +0200 Subject: [PATCH 19/22] whitelist fastapi (#2034) ## Changes whitelist fastapi ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 0886bb3f12..89db02391f 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -3248,6 +3248,57 @@ "faker.utils.loading": [], "faker.utils.text": [] }, + "fastapi": { + "fastapi": [], + "fastapi._compat": [], + "fastapi.applications": [], + "fastapi.background": [], + "fastapi.concurrency": [], + "fastapi.datastructures": [], + "fastapi.dependencies": [], + "fastapi.dependencies.models": [], + "fastapi.dependencies.utils": [], + "fastapi.encoders": [], + "fastapi.exception_handlers": [], + "fastapi.exceptions": [], + "fastapi.logger": [], + "fastapi.middleware": [], + "fastapi.middleware.cors": [], + "fastapi.middleware.gzip": [], + "fastapi.middleware.httpsredirect": [], + "fastapi.middleware.trustedhost": [], + "fastapi.middleware.wsgi": [], + "fastapi.openapi": [], + "fastapi.openapi.constants": [], + "fastapi.openapi.docs": [], + "fastapi.openapi.models": [], + "fastapi.openapi.utils": [], + "fastapi.param_functions": [], + "fastapi.params": [], + "fastapi.requests": [], + "fastapi.responses": [], + "fastapi.routing": [], + "fastapi.security": [], + "fastapi.security.api_key": [], + "fastapi.security.base": [], + "fastapi.security.http": [], + "fastapi.security.oauth2": [], + "fastapi.security.open_id_connect_url": [], + "fastapi.security.utils": [], + "fastapi.staticfiles": [], + "fastapi.templating": [], + "fastapi.testclient": [], + "fastapi.types": [], + "fastapi.utils": [], + "fastapi.websockets": [] + }, + "fastapi-cli": { + "fastapi_cli": [], + "fastapi_cli.cli": [], + "fastapi_cli.discover": [], + 
"fastapi_cli.exceptions": [], + "fastapi_cli.logging": [] + }, "fastcluster": { "fastcluster": [] }, From 19a592f209506124e6c085af90f86a69ef9f0ece Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:51:54 +0200 Subject: [PATCH 20/22] whitelist tenacity (#2036) ## Changes whitelist tenacity ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 89db02391f..9c22d2cdae 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -24954,6 +24954,20 @@ "isympy": [], "sympy": [] }, + "tenacity": { + "tenacity": [], + "tenacity._utils": [], + "tenacity.after": [], + "tenacity.asyncio": [], + "tenacity.asyncio.retry": [], + "tenacity.before": [], + "tenacity.before_sleep": [], + "tenacity.nap": [], + "tenacity.retry": [], + "tenacity.stop": [], + "tenacity.tornadoweb": [], + "tenacity.wait": [] + }, "tensorboard": { "tensorboard": [] }, From a7006b6ff2eb5f2e13cf7e22479f62428fe869bf Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:54:32 +0200 Subject: [PATCH 21/22] whitelist sentence-transformers (#2038) ## Changes whitelist sentence-transformers ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/known.json | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 9c22d2cdae..c7820707ee 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -22041,6 +22041,101 @@ "seaborn.utils": [], "seaborn.widgets": [] }, + "sentence-transformers": { + "sentence_transformers.LoggingHandler": [], + "sentence_transformers.SentenceTransformer": [], + "sentence_transformers": [], + "sentence_transformers.cross_encoder.CrossEncoder": [], + "sentence_transformers.cross_encoder": [], + "sentence_transformers.cross_encoder.evaluation.CEBinaryAccuracyEvaluator": [], + "sentence_transformers.cross_encoder.evaluation.CEBinaryClassificationEvaluator": [], + "sentence_transformers.cross_encoder.evaluation.CECorrelationEvaluator": [], + "sentence_transformers.cross_encoder.evaluation.CEF1Evaluator": [], + "sentence_transformers.cross_encoder.evaluation.CERerankingEvaluator": [], + "sentence_transformers.cross_encoder.evaluation.CESoftmaxAccuracyEvaluator": [], + "sentence_transformers.cross_encoder.evaluation": [], + "sentence_transformers.data_collator": [], + "sentence_transformers.datasets.DenoisingAutoEncoderDataset": [], + "sentence_transformers.datasets.NoDuplicatesDataLoader": [], + "sentence_transformers.datasets.ParallelSentencesDataset": [], + "sentence_transformers.datasets.SentenceLabelDataset": [], + "sentence_transformers.datasets.SentencesDataset": [], + "sentence_transformers.datasets": [], + "sentence_transformers.evaluation.BinaryClassificationEvaluator": [], + "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator": [], + "sentence_transformers.evaluation.InformationRetrievalEvaluator": [], + "sentence_transformers.evaluation.LabelAccuracyEvaluator": [], + "sentence_transformers.evaluation.MSEEvaluator": [], + 
"sentence_transformers.evaluation.MSEEvaluatorFromDataFrame": [], + "sentence_transformers.evaluation.ParaphraseMiningEvaluator": [], + "sentence_transformers.evaluation.RerankingEvaluator": [], + "sentence_transformers.evaluation.SentenceEvaluator": [], + "sentence_transformers.evaluation.SequentialEvaluator": [], + "sentence_transformers.evaluation.SimilarityFunction": [], + "sentence_transformers.evaluation.TranslationEvaluator": [], + "sentence_transformers.evaluation.TripletEvaluator": [], + "sentence_transformers.evaluation": [], + "sentence_transformers.fit_mixin": [], + "sentence_transformers.losses.AdaptiveLayerLoss": [], + "sentence_transformers.losses.AnglELoss": [], + "sentence_transformers.losses.BatchAllTripletLoss": [], + "sentence_transformers.losses.BatchHardSoftMarginTripletLoss": [], + "sentence_transformers.losses.BatchHardTripletLoss": [], + "sentence_transformers.losses.BatchSemiHardTripletLoss": [], + "sentence_transformers.losses.CachedGISTEmbedLoss": [], + "sentence_transformers.losses.CachedMultipleNegativesRankingLoss": [], + "sentence_transformers.losses.CoSENTLoss": [], + "sentence_transformers.losses.ContrastiveLoss": [], + "sentence_transformers.losses.ContrastiveTensionLoss": [], + "sentence_transformers.losses.CosineSimilarityLoss": [], + "sentence_transformers.losses.DenoisingAutoEncoderLoss": [], + "sentence_transformers.losses.GISTEmbedLoss": [], + "sentence_transformers.losses.MSELoss": [], + "sentence_transformers.losses.MarginMSELoss": [], + "sentence_transformers.losses.Matryoshka2dLoss": [], + "sentence_transformers.losses.MatryoshkaLoss": [], + "sentence_transformers.losses.MegaBatchMarginLoss": [], + "sentence_transformers.losses.MultipleNegativesRankingLoss": [], + "sentence_transformers.losses.MultipleNegativesSymmetricRankingLoss": [], + "sentence_transformers.losses.OnlineContrastiveLoss": [], + "sentence_transformers.losses.SoftmaxLoss": [], + "sentence_transformers.losses.TripletLoss": [], + "sentence_transformers.losses": [], + "sentence_transformers.model_card": [], + "sentence_transformers.model_card_templates": [], + "sentence_transformers.models.Asym": [], + "sentence_transformers.models.BoW": [], + "sentence_transformers.models.CLIPModel": [], + "sentence_transformers.models.CNN": [], + "sentence_transformers.models.Dense": [], + "sentence_transformers.models.Dropout": [], + "sentence_transformers.models.LSTM": [], + "sentence_transformers.models.LayerNorm": [], + "sentence_transformers.models.Normalize": [], + "sentence_transformers.models.Pooling": [], + "sentence_transformers.models.Transformer": [], + "sentence_transformers.models.WeightedLayerPooling": [], + "sentence_transformers.models.WordEmbeddings": [], + "sentence_transformers.models.WordWeights": [], + "sentence_transformers.models": [], + "sentence_transformers.models.tokenizer.PhraseTokenizer": [], + "sentence_transformers.models.tokenizer.WhitespaceTokenizer": [], + "sentence_transformers.models.tokenizer.WordTokenizer": [], + "sentence_transformers.models.tokenizer": [], + "sentence_transformers.quantization": [], + "sentence_transformers.readers.InputExample": [], + "sentence_transformers.readers.LabelSentenceReader": [], + "sentence_transformers.readers.NLIDataReader": [], + "sentence_transformers.readers.PairedFilesReader": [], + "sentence_transformers.readers.STSDataReader": [], + "sentence_transformers.readers.TripletReader": [], + "sentence_transformers.readers": [], + "sentence_transformers.sampler": [], + "sentence_transformers.similarity_functions": [], + 
"sentence_transformers.trainer": [], + "sentence_transformers.training_args": [], + "sentence_transformers.util": [] + }, "sentry-sdk": { "sentry_sdk": [], "sentry_sdk.integrations.spark.spark_driver": [ From 2c2f09e9c43fa067c4a50a5ef271379fd6a7e649 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 5 Jul 2024 09:55:01 +0200 Subject: [PATCH 22/22] whitelist opencv-python (#2039) ## Changes whitelist opencv-python ### Linked issues Progresses #1901 ### Functionality None ### Tests - [x] manually tested Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/known.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index c7820707ee..4e75421b28 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -6901,6 +6901,21 @@ "pydevd_plugins.extensions": [], "pydevd_plugins.extensions.pydevd_plugin_omegaconf": [] }, + "opencv-python": { + "cv2": [], + "cv2.config-3": [], + "cv2.config": [], + "cv2.data": [], + "cv2.gapi": [], + "cv2.load_config_py2": [], + "cv2.load_config_py3": [], + "cv2.mat_wrapper": [], + "cv2.misc": [], + "cv2.misc.version": [], + "cv2.typing": [], + "cv2.utils": [], + "cv2.version": [] + }, "opentelemetry-api": { "opentelemetry._logs": [], "opentelemetry._logs._internal": [],