Skip to content

Commit 2219a4f

Browse files
committed
fix: misidentify-column-name-as-alias (#539)
1 parent 0326b59 commit 2219a4f

File tree

4 files changed

+140
-21
lines changed

4 files changed

+140
-21
lines changed

sqllineage/core/models.py

+1
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def _to_src_col(
223223
else:
224224
# select unqualified column
225225
source = _to_src_col(src_col, None)
226+
setattr(source, "has_qualifier", False)
226227
for table in set(alias_mapping.values()):
227228
# in case of only one table, we get the right answer
228229
# in case of multiple tables, a bunch of possible tables are set

sqllineage/core/parser/__init__.py

+27-20
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,15 @@ def end_of_query_cleanup(self, holder: SubQueryLineageHolder) -> None:
2727
lateral_aliases = set()
2828
for idx, tgt_col in enumerate(col_grp):
2929
tgt_col.parent = tgt_tbl
30-
for lateral_alias_ref in col_grp[idx + 1 :]: # noqa: E203
31-
if any(
32-
src_col[0] == tgt_col.raw_name
33-
for src_col in lateral_alias_ref.source_columns
34-
):
35-
lateral_aliases.add(tgt_col.raw_name)
36-
break
30+
if hasattr(tgt_col, "has_alias") and tgt_col.has_alias is True:
31+
for lateral_alias_ref in col_grp[idx + 1 :]: # noqa: E203
32+
if any(
33+
src_col[0] == tgt_col.raw_name
34+
for src_col in lateral_alias_ref.source_columns
35+
if src_col[1] is None
36+
):
37+
lateral_aliases.add(tgt_col.raw_name)
38+
break
3739
for src_col in tgt_col.to_source_columns(
3840
holder.get_alias_mapping_from_table_group(tbl_grp)
3941
):
@@ -45,19 +47,24 @@ def end_of_query_cleanup(self, holder: SubQueryLineageHolder) -> None:
4547
# when the length doesn't match, we fall back to default behavior
4648
tgt_col = write_columns[idx]
4749
is_lateral_alias_ref = False
48-
for wc in holder.write_columns:
49-
if wc.raw_name == "*":
50-
continue
51-
if (
52-
src_col.raw_name == wc.raw_name
53-
and src_col.raw_name in lateral_aliases
54-
):
55-
is_lateral_alias_ref = True
56-
for lateral_alias_col in holder.get_source_columns(wc):
57-
holder.add_column_lineage(
58-
lateral_alias_col, tgt_col
59-
)
60-
break
50+
if idx > 0:
51+
for wc in holder.write_columns:
52+
if wc.raw_name == "*":
53+
continue
54+
if (
55+
hasattr(src_col, "has_qualifier")
56+
and src_col.has_qualifier is False
57+
and src_col.raw_name == wc.raw_name
58+
and src_col.raw_name in lateral_aliases
59+
):
60+
is_lateral_alias_ref = True
61+
for lateral_alias_col in holder.get_source_columns(
62+
wc
63+
):
64+
holder.add_column_lineage(
65+
lateral_alias_col, tgt_col
66+
)
67+
break
6168
if is_lateral_alias_ref:
6269
continue
6370
holder.add_column_lineage(src_col, tgt_col)

sqllineage/core/parser/sqlfluff/models.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,12 @@ def of(column: BaseSegment, **kwargs) -> Column:
105105
if column.type == "select_clause_element":
106106
source_columns, alias = SqlFluffColumn._get_column_and_alias(column)
107107
if alias:
108-
return Column(
108+
alias_column = Column(
109109
alias,
110110
source_columns=source_columns,
111111
)
112+
setattr(alias_column, "has_alias", True)
113+
return alias_column
112114
if source_columns:
113115
column_name = None
114116
for sub_segment in list_child_segments(column):

tests/sql/column/test_column_select_lateral_alias_ref.py

+109
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,113 @@ def test_column_top_level_lateral_ref():
2727
ColumnQualifierTuple("id", "public.tgt_tbl1"),
2828
),
2929
],
30+
test_sqlparse=False,
31+
)
32+
sql = """
33+
insert into public.tgt_tbl1
34+
(
35+
name,
36+
email
37+
)
38+
select
39+
st1.name,
40+
st1.name || st1.email || '@gmail.com' as email
41+
from
42+
public.src_tbl1 as st1
43+
"""
44+
assert_column_lineage_equal(
45+
sql,
46+
[
47+
(
48+
ColumnQualifierTuple("name", "public.src_tbl1"),
49+
ColumnQualifierTuple("name", "public.tgt_tbl1"),
50+
),
51+
(
52+
ColumnQualifierTuple("name", "public.src_tbl1"),
53+
ColumnQualifierTuple("email", "public.tgt_tbl1"),
54+
),
55+
(
56+
ColumnQualifierTuple("email", "public.src_tbl1"),
57+
ColumnQualifierTuple("email", "public.tgt_tbl1"),
58+
),
59+
],
60+
test_sqlparse=False,
61+
)
62+
sql = """
63+
insert into public.tgt_tbl1
64+
(
65+
id,
66+
id_original
67+
)
68+
select
69+
'a || b || c' || id as id,
70+
id as id_original
71+
from
72+
public.src_tbl1
73+
"""
74+
assert_column_lineage_equal(
75+
sql,
76+
[
77+
(
78+
ColumnQualifierTuple("id", "public.src_tbl1"),
79+
ColumnQualifierTuple("id", "public.tgt_tbl1"),
80+
),
81+
(
82+
ColumnQualifierTuple("id", "public.src_tbl1"),
83+
ColumnQualifierTuple("id_original", "public.tgt_tbl1"),
84+
),
85+
],
86+
test_sqlparse=False,
87+
)
88+
sql = """
89+
insert into public.tgt_tbl1
90+
(
91+
id,
92+
id_original
93+
)
94+
select
95+
a || b || c || id as id,
96+
id as id_original -- # noqa: E501 TODO: I need the metadata information for the table public.src_tbl1 to identify whether the column reference 'id' in this context is from the table public.src_tbl1 or from an alias reference, currently being used as an alias reference. Note: This decision may significantly deviate from the actual scenario.
97+
from
98+
public.src_tbl1
99+
"""
100+
assert_column_lineage_equal(
101+
sql,
102+
[
103+
(
104+
ColumnQualifierTuple("a", "public.src_tbl1"),
105+
ColumnQualifierTuple("id", "public.tgt_tbl1"),
106+
),
107+
(
108+
ColumnQualifierTuple("b", "public.src_tbl1"),
109+
ColumnQualifierTuple("id", "public.tgt_tbl1"),
110+
),
111+
(
112+
ColumnQualifierTuple("c", "public.src_tbl1"),
113+
ColumnQualifierTuple("id", "public.tgt_tbl1"),
114+
),
115+
(
116+
ColumnQualifierTuple("id", "public.src_tbl1"),
117+
ColumnQualifierTuple("id", "public.tgt_tbl1"),
118+
),
119+
(
120+
ColumnQualifierTuple("a", "public.src_tbl1"),
121+
ColumnQualifierTuple("id_original", "public.tgt_tbl1"),
122+
),
123+
(
124+
ColumnQualifierTuple("b", "public.src_tbl1"),
125+
ColumnQualifierTuple("id_original", "public.tgt_tbl1"),
126+
),
127+
(
128+
ColumnQualifierTuple("c", "public.src_tbl1"),
129+
ColumnQualifierTuple("id_original", "public.tgt_tbl1"),
130+
),
131+
(
132+
ColumnQualifierTuple("id", "public.src_tbl1"),
133+
ColumnQualifierTuple("id_original", "public.tgt_tbl1"),
134+
),
135+
],
136+
test_sqlparse=False,
30137
)
31138

32139

@@ -60,6 +167,7 @@ def test_column_lateral_ref_within_subquery():
60167
ColumnQualifierTuple("name", "public.tgt_tbl1"),
61168
),
62169
],
170+
test_sqlparse=False,
63171
)
64172

65173
sql = """
@@ -95,4 +203,5 @@ def test_column_lateral_ref_within_subquery():
95203
ColumnQualifierTuple("name", "public.tgt_tbl1"),
96204
),
97205
],
206+
test_sqlparse=False,
98207
)

0 commit comments

Comments
 (0)