Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(#13734): Properly escape special characters in CSV output #13735

Merged
merged 5 commits into from
Mar 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions superset/charts/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,7 @@ def get_data_response(
if result_format == ChartDataResultFormat.CSV:
# return the first result
data = result["queries"][0]["data"]
return CsvResponse(
data,
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
)
return CsvResponse(data, headers=generate_download_headers("csv"))

if result_format == ChartDataResultFormat.JSON:
response_data = simplejson.dumps(
Expand Down
5 changes: 4 additions & 1 deletion superset/common/query_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from superset.extensions import cache_manager, security_manager
from superset.stats_logger import BaseStatsLogger
from superset.utils import csv
from superset.utils.cache import generate_cache_key, set_and_log_cache
from superset.utils.core import (
ChartDataResultFormat,
Expand Down Expand Up @@ -151,7 +152,9 @@ def df_metrics_to_num(df: pd.DataFrame, query_object: QueryObject) -> None:
def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
if self.result_format == ChartDataResultFormat.CSV:
include_index = not isinstance(df.index, pd.RangeIndex)
result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
result = csv.df_to_escaped_csv(
df, index=include_index, **config["CSV_EXPORT"]
)
return result or ""

return df.to_dict(orient="records")
Expand Down
67 changes: 67 additions & 0 deletions superset/utils/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from typing import Any

import pandas as pd

# Matches values that are purely a negative number (e.g. "-10", "-1.5");
# these start with "-" but are safe and must NOT be escaped.
negative_number_re = re.compile(r"^-[0-9.]+$")

# Matches values that spreadsheet software may evaluate as a formula
# (CSV injection). Written with re.VERBOSE so each alternative can be
# documented inline.
problematic_chars_re = re.compile(
    r"""
    ^(?:
        "{2}            # two leading double quotes (Excel strips them), or
        |
        \s+             # one or more leading spaces
    )
    (?=[\-@+|=%])       # ...immediately followed by a formula-trigger char
    |
    ^[\-@+|=%]          # or the value starts directly with a trigger char
    """,
    re.VERBOSE,
)
Comment on lines +24 to +30
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a FYI, in Python you can write verbose regular expressions with inline comments: https://docs.python.org/3/library/re.html#re.VERBOSE

You can also use + instead of {1,}, but no need to change this.



def escape_value(value: str) -> str:
    """
    Escapes a set of special characters.

    http://georgemauer.net/2017/10/07/csv-injection.html
    """
    # Negative numbers start with "-" but are harmless; leave them alone.
    if negative_number_re.match(value) is not None:
        return value

    # Anything that doesn't look like a formula trigger passes through.
    if problematic_chars_re.match(value) is None:
        return value

    # Escape pipe to be extra safe as this
    # can lead to remote code execution
    escaped = value.replace("|", "\\|")

    # Precede the line with a single quote. This prevents
    # evaluation of commands and some spreadsheet software
    # will hide this visually from the user. Many articles
    # claim a preceding space will work here too, however,
    # when uploading a csv file in Google sheets, a leading
    # space was ignored and code was still evaluated.
    return "'" + escaped


def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
    """
    Serialize a DataFrame to CSV with headers and string cells escaped
    against CSV injection.

    :param df: the DataFrame to serialize; not mutated
    :param kwargs: forwarded verbatim to ``DataFrame.to_csv``
    :returns: whatever ``DataFrame.to_csv`` returns (a string when no
        ``path_or_buf`` is supplied, otherwise ``None``)
    """

    # PEP 8 (E731): use a def, not a lambda bound to a name.
    def escape_values(value: Any) -> Any:
        # Only strings can carry a formula payload; other dtypes pass through.
        return escape_value(value) if isinstance(value, str) else value

    # Escape csv headers
    df = df.rename(columns=escape_values)

    # Escape csv rows
    df = df.applymap(escape_values)

    return df.to_csv(**kwargs)
1 change: 1 addition & 0 deletions superset/views/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors
"""

charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
default_mimetype = "text/csv"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍



def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:
Expand Down
17 changes: 5 additions & 12 deletions superset/views/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
from superset.sql_validators import get_validator_by_name
from superset.tasks.async_queries import load_explore_json_into_cache
from superset.typing import FlaskResponse
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.async_query_manager import AsyncQueryTokenException
from superset.utils.cache import etag_cache
from superset.utils.core import ReservedUrlParameters
Expand Down Expand Up @@ -437,10 +437,7 @@ def generate_json(
) -> FlaskResponse:
if response_type == utils.ChartDataResultFormat.CSV:
return CsvResponse(
viz_obj.get_csv(),
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
viz_obj.get_csv(), headers=generate_download_headers("csv")
)

if response_type == utils.ChartDataResultType.QUERY:
Expand Down Expand Up @@ -2628,18 +2625,14 @@ def csv(self, client_id: str) -> FlaskResponse: # pylint: disable=no-self-use
columns = [c["name"] for c in obj["columns"]]
df = pd.DataFrame.from_records(obj["data"], columns=columns)
logger.info("Using pandas to convert to CSV")
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
else:
logger.info("Running a query to turn into CSV")
sql = query.select_sql or query.executed_sql
df = query.database.get_df(sql, query.schema)
# TODO(bkyryliuk): add compression=gzip for big files.
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
response = Response(csv, mimetype="text/csv")
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
quoted_csv_name = parse.quote(query.name)
response.headers["Content-Disposition"] = (
f'attachment; filename="{quoted_csv_name}.csv"; '
f"filename*=UTF-8''{quoted_csv_name}.csv"
response = CsvResponse(
csv_data, headers=generate_download_headers("csv", quoted_csv_name)
)
event_info = {
"event_type": "data_export",
Expand Down
4 changes: 2 additions & 2 deletions superset/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
from superset.models.cache import CacheKey
from superset.models.helpers import QueryResult
from superset.typing import QueryObjectDict, VizData, VizPayload
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.cache import set_and_log_cache
from superset.utils.core import (
DTTM_ALIAS,
Expand Down Expand Up @@ -615,7 +615,7 @@ def data(self) -> Dict[str, Any]:
def get_csv(self) -> Optional[str]:
df = self.get_df_payload()["df"] # leverage caching logic
include_index = not isinstance(df.index, pd.RangeIndex)
return df.to_csv(index=include_index, **config["CSV_EXPORT"])
return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])

def get_data(self, df: pd.DataFrame) -> VizData:
return df.to_dict(orient="records")
Expand Down
80 changes: 80 additions & 0 deletions tests/utils/csv_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-self-use
import io

import pandas as pd
import pytest

from superset.utils import csv


def test_escape_value():
result = csv.escape_value("value")
assert result == "value"

result = csv.escape_value("-10")
assert result == "-10"

result = csv.escape_value("@value")
assert result == "'@value"

result = csv.escape_value("+value")
assert result == "'+value"

result = csv.escape_value("-value")
assert result == "'-value"

result = csv.escape_value("=value")
assert result == "'=value"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing some testing in the ephemeral env, I'm curious why just the single quote at the beginning rather than wrapping the value?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The defusedcsv package that you reference above states:

Excel will not display the ' character to the user

but this is not the case in my tests, where Excel displays the following: '=10+20

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TLDR: it was what I saw recommended the most when googling around.

Longer: I saw a few different recommendations but they all seem to be intrusive in one form or another. The approach I saw most often recommended was the single preceding quote. There is also a tab approach, but that looks invasive to some degree as well.

However, now that I just went back to look this up again, I found a post that shows how even this is not sufficient:

One product in my organization fixed a similar issue with 2 layers of defense. For any CSV cell value that starts with +, -, @, = (as suggested in http://georgemauer.net/2017/10/07/csv-injection.html or OWASP) the fix added (1) a preceding TAB char, (2) single quotes around the cell value. But later we found that adding a single / pair of double quote(s) before the attacker's payload can simply evade the filter to check for the chars +, -, @, =. e.g. if the payload attacker injects is =2+5+cmd|' /C calc'!A0 filter will catch it and mitigate the risks. But if attacker puts the payload ""=2+5+cmd|' /C calc'!A0 filter won't be able to catch it as it was checking for only values starting with +, -, @, =. The end result will be same because MS excel, while rendering the value, simply skips the leading double quotes in CSV values.

Their recommendation is:

prepend a SPACE or TAB or SINGLE QUOTE to ALL CSV values before exporting them to file. This mitigation leaves the CSV file human readable but not executable. DO NOT check for leading +, -, @, =, " and prepend to only those values.

Does that sound like a valid approach or do you have a preferred alternative?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatives that I see are:

  1. Alter every column with a preceding space. Intrusive, but not as noticeable as a single quote and will, according to that post, solve the issue in all cases
  2. Try to be more clever with a regex to account for the quote issue he mentions and hope this captures everything
  3. Keep current solution (maybe change to preceding space) and ignore the quote issue. Less invasive, but still problematic.

We can also wrap the whole column with single quotes, but then the UX is worse in some spreadsheet software that hides the preceding quote from user visually.

Mitigating this is going to be intrusive no matter which solution we choose. I'd recommend 1 above if we want to be covered in all cases, but also happy to go with 2 if we feel confident that the quote issue is the only other case to account for.

Thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think option 1 is too intrusive. Option 2 or shipping as-is are viable options. Curious what others think here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like 2 is doable. I agree with @robdiciuccio that 1 is intrusive.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the implementation to be a bit more clever after some more testing in google sheets and excel. Best I can tell, this gets the job done without being overly intrusive.

Some interesting things I found while testing:

  • Many resources say that a leading space will work the same as a leading single quote, but I found that Google sheets will ignore a leading space and evaluate the command
  • Google Sheets seems to hide leading single quote while Excel did not
  • Pandas seems to remove the leading double quotes


result = csv.escape_value("|value")
assert result == "'\|value"

result = csv.escape_value("%value")
assert result == "'%value"

result = csv.escape_value("=cmd|' /C calc'!A0")
assert result == "'=cmd\|' /C calc'!A0"

result = csv.escape_value('""=10+2')
assert result == '\'""=10+2'

result = csv.escape_value(" =10+2")
assert result == "' =10+2"


def test_df_to_escaped_csv():
    # Build a small CSV in memory containing injection payloads in both
    # the header row and the data rows.
    input_rows = [
        ["col_a", "=func()"],
        ["-10", "=cmd|' /C calc'!A0"],
        ["a", '""=b'],
        [" =a", "b"],
    ]
    raw_csv = "\n".join(",".join(cells) for cells in input_rows)

    df = pd.read_csv(io.StringIO(raw_csv))

    escaped = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
    result_rows = [line.split(",") for line in escaped.strip().split("\n")]

    assert result_rows == [
        ["col_a", "'=func()"],
        ["-10", "'=cmd\\|' /C calc'!A0"],
        ["a", "'=b"],  # pandas seems to be removing the leading ""
        ["' =a", "b"],
    ]
]