Support reading multipart data with \n (LF) lines (#3492)

kxepal · asvetlov · commit fcedc6603c18 · 2019-01-15T09:01:13.000+02:00
* Support reading multipart data with `\n` (`LF`) lines

While the RFC clearly specifies `CRLF` newlines, there are quite a lot
of implementations which use just `LF`. Even Python's stdlib produces
multiparts with `\n` newlines by default for compatibility reasons.

We won't change how we produce multipart content - here we follow the
RFC. However, we can detect `\n` lines quite easily, which makes
supporting them quite cheap.

* Add test about mixed newlines

Just in case. That's a strange case, but it seems we handle it.

* Make the newline argument keyword-only and explicitly private

This argument is not designed to be set by users. It is chosen
automatically depending on the newline format of the multipart data
being parsed, and due to the recursive nature of the multipart format
it has to be passed around to nested readers.
diff --git a/CHANGES/2302.feature b/CHANGES/2302.feature
@@ -0,0 +1 @@
+Support reading multipart data with `\n` (`LF`) lines
diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py
@@ -237,11 +237,17 @@ class BodyPartReader:
 
     chunk_size = 8192
 
-    def __init__(self, boundary: bytes,
-                 headers: Mapping[str, Optional[str]],
-                 content: StreamReader) -> None:
+    def __init__(
+        self,
+        boundary: bytes,
+        headers: Mapping[str, Optional[str]],
+        content: StreamReader,
+        *,
+        _newline: bytes = b'\r\n'
+    ) -> None:
         self.headers = headers
         self._boundary = boundary
+        self._newline = _newline
         self._content = content
         self._at_eof = False
         length = self.headers.get(CONTENT_LENGTH, None)
@@ -300,8 +306,8 @@ async def read_chunk(self, size: int=chunk_size) -> bytes:
         if self._read_bytes == self._length:
             self._at_eof = True
         if self._at_eof:
-            clrf = await self._content.readline()
-            assert b'\r\n' == clrf, \
+            newline = await self._content.readline()
+            assert newline == self._newline, \
                 'reader did not read all the data or it is malformed'
         return chunk
 
@@ -328,11 +334,15 @@ async def _read_chunk_from_stream(self, size: int) -> bytes:
         assert self._content_eof < 3, "Reading after EOF"
         assert self._prev_chunk is not None
         window = self._prev_chunk + chunk
-        sub = b'\r\n' + self._boundary
+
+        intermeditate_boundary = self._newline + self._boundary
+
         if first_chunk:
-            idx = window.find(sub)
+            pos = 0
         else:
-            idx = window.find(sub, max(0, len(self._prev_chunk) - len(sub)))
+            pos = max(0, len(self._prev_chunk) - len(intermeditate_boundary))
+
+        idx = window.find(intermeditate_boundary, pos)
         if idx >= 0:
             # pushing boundary back to content
             with warnings.catch_warnings():
@@ -344,6 +354,7 @@ async def _read_chunk_from_stream(self, size: int) -> bytes:
             chunk = window[len(self._prev_chunk):idx]
             if not chunk:
                 self._at_eof = True
+
         result = self._prev_chunk
         self._prev_chunk = chunk
         return result
@@ -372,7 +383,8 @@ async def readline(self) -> bytes:
         else:
             next_line = await self._content.readline()
             if next_line.startswith(self._boundary):
-                line = line[:-2]  # strip CRLF but only once
+                # strip newline but only once
+                line = line[:-len(self._newline)]
             self._unread.append(next_line)
 
         return line
@@ -516,10 +528,16 @@ class MultipartReader:
     #: Body part reader class for non multipart/* content types.
     part_reader_cls = BodyPartReader
 
-    def __init__(self, headers: Mapping[str, str],
-                 content: StreamReader) -> None:
+    def __init__(
+        self,
+        headers: Mapping[str, str],
+        content: StreamReader,
+        *,
+        _newline: bytes = b'\r\n'
+    ) -> None:
         self.headers = headers
         self._boundary = ('--' + self._get_boundary()).encode()
+        self._newline = _newline
         self._content = content
         self._last_part = None
         self._at_eof = False
@@ -592,9 +610,13 @@ def _get_part_reader(self, headers: 'CIMultiDictProxy[str]') -> Any:
         if mimetype.type == 'multipart':
             if self.multipart_reader_cls is None:
                 return type(self)(headers, self._content)
-            return self.multipart_reader_cls(headers, self._content)
+            return self.multipart_reader_cls(
+                headers, self._content, _newline=self._newline
+            )
         else:
-            return self.part_reader_cls(self._boundary, headers, self._content)
+            return self.part_reader_cls(
+                self._boundary, headers, self._content, _newline=self._newline
+            )
 
     def _get_boundary(self) -> str:
         mimetype = parse_mimetype(self.headers[CONTENT_TYPE])
@@ -625,6 +647,11 @@ async def _read_until_first_boundary(self) -> None:
             if chunk == b'':
                 raise ValueError("Could not find starting boundary %r"
                                  % (self._boundary))
+            if chunk.startswith(self._boundary):
+                _, newline = chunk.split(self._boundary, 1)
+                assert newline in (b'\r\n', b'\n')
+                self._newline = newline
+
             chunk = chunk.rstrip()
             if chunk == self._boundary:
                 return
diff --git a/tests/test_multipart.py b/tests/test_multipart.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Support reading multipart data with `\n` (`LF`) lines