Response bodies can be empty or missing + HttpParser refactoring (#688)

abhinavsingh · web-flow · commit d9200fd082e0 · 2021-11-06T22:07:11.000+05:30
* Responses can have None body, remove assertions, update modify chunk plugin to not modify chunks for responses with no content

* Address mypy warning after removing assertion

* Reusable get_body_or_chunks

* Order methods by public/private, mark private ones with _ prefix

* HttpParser.url deprecation notice (renamed to _url).  Add zero-copy todo
diff --git a/proxy/http/parser.py b/proxy/http/parser.py
@@ -43,35 +43,59 @@
 
 
 class HttpParser:
-    """HTTP request/response parser."""
+    """HTTP request/response parser.
+
+    TODO: Make me zero-copy by using memoryview.
+    Currently due to chunk/buffer handling we
+    are not able to utilize memoryview
+    efficiently.
+
+    For this to happen we must store `buffer`
+    as List[memoryview] instead of raw bytes and
+    update parser to work accordingly.
+    """
 
     def __init__(self, parser_type: int) -> None:
         self.type: int = parser_type
         self.state: int = httpParserStates.INITIALIZED
 
+        self.host: Optional[bytes] = None
+        self.port: Optional[int] = None
+        self.path: Optional[bytes] = None
+        self.method: Optional[bytes] = None
+        self.code: Optional[bytes] = None
+        self.reason: Optional[bytes] = None
+        self.version: Optional[bytes] = None
+
         # Total size of raw bytes passed for parsing
         self.total_size: int = 0
 
         # Buffer to hold unprocessed bytes
         self.buffer: bytes = b''
 
+        # Keys are lower case header names
+        # Values are 2-tuple containing original
+        # header and its value as received.
         self.headers: Dict[bytes, Tuple[bytes, bytes]] = {}
         self.body: Optional[bytes] = None
 
-        self.method: Optional[bytes] = None
-        self.url: Optional[urlparse.SplitResultBytes] = None
-        self.code: Optional[bytes] = None
-        self.reason: Optional[bytes] = None
-        self.version: Optional[bytes] = None
-
         self.chunk_parser: Optional[ChunkParser] = None
 
-        # This cleans up developer APIs as Python urlparse.urlsplit behaves differently
-        # for incoming proxy request and incoming web request.  Web request is the one
-        # which is broken.
-        self.host: Optional[bytes] = None
-        self.port: Optional[int] = None
-        self.path: Optional[bytes] = None
+        # TODO: Deprecate me, we don't need this in core.
+        #
+        # Deprecated since v2.4.0
+        #
+        # This is mostly for developers so that they can directly
+        # utilize a url object, but is unnecessary as parser
+        # provides all the necessary parsed information.
+        #
+        # But developers can utilize urlsplit or whatever
+        # library they are using when necessary. This will certainly
+        # give some performance boost as url parsing won't be needed
+        # for every request/response object.
+        #
+        # (except query string and fragments)
+        self._url: Optional[urlparse.SplitResultBytes] = None
 
     @classmethod
     def request(cls: Type[T], raw: bytes) -> T:
@@ -116,157 +140,51 @@ def set_url(self, url: bytes) -> None:
         # with urlsplit, which expects a fully qualified url.
         if self.method == httpMethods.CONNECT:
             url = b'https://' + url
-        self.url = urlparse.urlsplit(url)
-        self.set_line_attributes()
-
-    def set_line_attributes(self) -> None:
-        if self.type == httpParserTypes.REQUEST_PARSER:
-            if self.method == httpMethods.CONNECT and self.url:
-                self.host = self.url.hostname
-                self.port = 443 if self.url.port is None else self.url.port
-            elif self.url:
-                self.host, self.port = self.url.hostname, self.url.port \
-                    if self.url.port else DEFAULT_HTTP_PORT
-            else:
-                raise KeyError(
-                    'Invalid request. Method: %r, Url: %r' %
-                    (self.method, self.url),
-                )
-            self.path = self.build_path()
+        self._url = urlparse.urlsplit(url)
+        self._set_line_attributes()
 
     def is_chunked_encoded(self) -> bool:
         return b'transfer-encoding' in self.headers and \
                self.headers[b'transfer-encoding'][1].lower() == b'chunked'
 
+    def content_expected(self) -> bool:
+        return b'content-length' in self.headers and int(self.header(b'content-length')) > 0
+
     def body_expected(self) -> bool:
-        return (
-            b'content-length' in self.headers and
-            int(self.header(b'content-length')) > 0
-        ) or \
-            self.is_chunked_encoded()
+        return self.content_expected() or self.is_chunked_encoded()
 
     def parse(self, raw: bytes) -> None:
         """Parses Http request out of raw bytes.
 
-        Check HttpParser state after parse has successfully returned."""
+        Check for `HttpParser.state` after `parse` has successfully returned.
+        """
         self.total_size += len(raw)
         raw = self.buffer + raw
-        self.buffer = b''
-
-        more = len(raw) > 0
+        self.buffer, more = b'', len(raw) > 0
         while more and self.state != httpParserStates.COMPLETE:
-            if self.state in (
-                    httpParserStates.HEADERS_COMPLETE,
-                    httpParserStates.RCVING_BODY,
-            ):
-                if b'content-length' in self.headers:
-                    self.state = httpParserStates.RCVING_BODY
-                    if self.body is None:
-                        self.body = b''
-                    total_size = int(self.header(b'content-length'))
-                    received_size = len(self.body)
-                    self.body += raw[:total_size - received_size]
-                    if self.body and \
-                            len(self.body) == int(self.header(b'content-length')):
-                        self.state = httpParserStates.COMPLETE
-                    more, raw = len(raw) > 0, raw[total_size - received_size:]
-                elif self.is_chunked_encoded():
-                    if not self.chunk_parser:
-                        self.chunk_parser = ChunkParser()
-                    raw = self.chunk_parser.parse(raw)
-                    if self.chunk_parser.state == chunkParserStates.COMPLETE:
-                        self.body = self.chunk_parser.body
-                        self.state = httpParserStates.COMPLETE
-                    more = False
-                else:
-                    raise NotImplementedError(
-                        'Parser shouldn\'t have reached here. ' +
-                        'This can happen when content length header is missing but their is a body in the payload',
-                    )
-            else:
-                more, raw = self.process(raw)
+            # A >= comparison with HEADERS_COMPLETE also covers the RCVING_BODY state
+            more, raw = self._process_body(raw) \
+                if self.state >= httpParserStates.HEADERS_COMPLETE else \
+                self._process_line_and_headers(raw)
         self.buffer = raw
 
-    def process(self, raw: bytes) -> Tuple[bool, bytes]:
-        """Returns False when no CRLF could be found in received bytes."""
-        line, raw = find_http_line(raw)
-        if line is None:
-            return False, raw
-
-        if self.state == httpParserStates.INITIALIZED:
-            self.process_line(line)
-            self.state = httpParserStates.LINE_RCVD
-        elif self.state in (httpParserStates.LINE_RCVD, httpParserStates.RCVING_HEADERS):
-            if self.state == httpParserStates.LINE_RCVD:
-                # LINE_RCVD state is equivalent to RCVING_HEADERS
-                self.state = httpParserStates.RCVING_HEADERS
-            if line.strip() == b'':  # Blank line received.
-                self.state = httpParserStates.HEADERS_COMPLETE
-            else:
-                self.process_header(line)
-
-        # When server sends a response line without any header or body e.g.
-        # HTTP/1.1 200 Connection established\r\n\r\n
-        if self.state == httpParserStates.LINE_RCVD and \
-                self.type == httpParserTypes.RESPONSE_PARSER and \
-                raw == CRLF:
-            self.state = httpParserStates.COMPLETE
-        elif self.state == httpParserStates.HEADERS_COMPLETE and \
-                not self.body_expected() and \
-                raw == b'':
-            self.state = httpParserStates.COMPLETE
-
-        return len(raw) > 0, raw
-
-    def process_line(self, raw: bytes) -> None:
-        line = raw.split(WHITESPACE)
-        if self.type == httpParserTypes.REQUEST_PARSER:
-            self.method = line[0].upper()
-            self.set_url(line[1])
-            self.version = line[2]
-        else:
-            self.version = line[0]
-            self.code = line[1]
-            self.reason = WHITESPACE.join(line[2:])
-
-    def process_header(self, raw: bytes) -> None:
-        parts = raw.split(COLON)
-        key = parts[0].strip()
-        value = COLON.join(parts[1:]).strip()
-        self.add_headers([(key, value)])
-
-    def build_path(self) -> bytes:
-        if not self.url:
-            return b'/None'
-        url = self.url.path
-        if url == b'':
-            url = b'/'
-        if not self.url.query == b'':
-            url += b'?' + self.url.query
-        if not self.url.fragment == b'':
-            url += b'#' + self.url.fragment
-        return url
-
     def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes:
         """Rebuild the request object."""
         assert self.method and self.version and self.path and self.type == httpParserTypes.REQUEST_PARSER
         if disable_headers is None:
             disable_headers = DEFAULT_DISABLE_HEADERS
-        body: Optional[bytes] = ChunkParser.to_chunks(self.body) \
-            if self.is_chunked_encoded() and self.body else \
-            self.body
+        body: Optional[bytes] = self._get_body_or_chunks()
         path = self.path
         if for_proxy:
-            assert self.url and self.host and self.port and self.path
+            assert self._url and self.host and self.port and self.path
             path = (
-                self.url.scheme +
+                self._url.scheme +
                 COLON + SLASH + SLASH +
                 self.host +
                 COLON +
                 str(self.port).encode() +
                 self.path
             ) if self.method != httpMethods.CONNECT else (self.host + COLON + str(self.port).encode())
-
         return build_http_request(
             self.method, path, self.version,
             headers={} if not self.headers else {
@@ -278,16 +196,15 @@ def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool =
 
     def build_response(self) -> bytes:
         """Rebuild the response object."""
-        assert self.code and self.version and self.body and self.type == httpParserTypes.RESPONSE_PARSER
+        assert self.code and self.version and self.type == httpParserTypes.RESPONSE_PARSER
         return build_http_response(
             status_code=int(self.code),
             protocol_version=self.version,
             reason=self.reason,
             headers={} if not self.headers else {
                 self.headers[k][0]: self.headers[k][1] for k in self.headers
             },
-            body=self.body if not self.is_chunked_encoded(
-            ) else ChunkParser.to_chunks(self.body),
+            body=self._get_body_or_chunks(),
         )
 
     def has_host(self) -> bool:
@@ -305,3 +222,110 @@ def is_connection_upgrade(self) -> bool:
         return self.version == HTTP_1_1 and \
             self.has_header(b'Connection') and \
             self.has_header(b'Upgrade')
+
+    def _process_body(self, raw: bytes) -> Tuple[bool, bytes]:
+        if b'content-length' in self.headers:
+            self.state = httpParserStates.RCVING_BODY
+            if self.body is None:
+                self.body = b''
+            total_size = int(self.header(b'content-length'))
+            received_size = len(self.body)
+            self.body += raw[:total_size - received_size]
+            if self.body and \
+                    len(self.body) == int(self.header(b'content-length')):
+                self.state = httpParserStates.COMPLETE
+            more, raw = len(raw) > 0, raw[total_size - received_size:]
+        elif self.is_chunked_encoded():
+            if not self.chunk_parser:
+                self.chunk_parser = ChunkParser()
+            raw = self.chunk_parser.parse(raw)
+            if self.chunk_parser.state == chunkParserStates.COMPLETE:
+                self.body = self.chunk_parser.body
+                self.state = httpParserStates.COMPLETE
+            more = False
+        else:
+            raise NotImplementedError(
+                'Parser shouldn\'t have reached here. ' +
+                'This can happen when content length header is missing but their is a body in the payload',
+            )
+        return more, raw
+
+    def _process_line_and_headers(self, raw: bytes) -> Tuple[bool, bytes]:
+        """Returns `(False, raw)` when no CRLF could be found in received bytes."""
+        line, raw = find_http_line(raw)
+        if line is None:
+            return False, raw
+
+        if self.state == httpParserStates.INITIALIZED:
+            self._process_line(line)
+            self.state = httpParserStates.LINE_RCVD
+        elif self.state in (httpParserStates.LINE_RCVD, httpParserStates.RCVING_HEADERS):
+            if self.state == httpParserStates.LINE_RCVD:
+                # LINE_RCVD state is equivalent to RCVING_HEADERS
+                self.state = httpParserStates.RCVING_HEADERS
+            if line.strip() == b'':  # Blank line received.
+                self.state = httpParserStates.HEADERS_COMPLETE
+            else:
+                self._process_header(line)
+
+        # When server sends a response line without any header or body e.g.
+        # HTTP/1.1 200 Connection established\r\n\r\n
+        if self.state == httpParserStates.LINE_RCVD and \
+                self.type == httpParserTypes.RESPONSE_PARSER and \
+                raw == CRLF:
+            self.state = httpParserStates.COMPLETE
+        elif self.state == httpParserStates.HEADERS_COMPLETE and \
+                not self.body_expected() and \
+                raw == b'':
+            self.state = httpParserStates.COMPLETE
+
+        return len(raw) > 0, raw
+
+    def _process_line(self, raw: bytes) -> None:
+        line = raw.split(WHITESPACE)
+        if self.type == httpParserTypes.REQUEST_PARSER:
+            self.method = line[0].upper()
+            self.set_url(line[1])
+            self.version = line[2]
+        else:
+            self.version = line[0]
+            self.code = line[1]
+            self.reason = WHITESPACE.join(line[2:])
+
+    def _process_header(self, raw: bytes) -> None:
+        parts = raw.split(COLON)
+        key = parts[0].strip()
+        value = COLON.join(parts[1:]).strip()
+        self.add_headers([(key, value)])
+
+    def _get_body_or_chunks(self) -> Optional[bytes]:
+        return ChunkParser.to_chunks(self.body) \
+            if self.body and self.is_chunked_encoded() else \
+            self.body
+
+    def _set_line_attributes(self) -> None:
+        if self.type == httpParserTypes.REQUEST_PARSER:
+            if self.method == httpMethods.CONNECT and self._url:
+                self.host = self._url.hostname
+                self.port = 443 if self._url.port is None else self._url.port
+            elif self._url:
+                self.host, self.port = self._url.hostname, self._url.port \
+                    if self._url.port else DEFAULT_HTTP_PORT
+            else:
+                raise KeyError(
+                    'Invalid request. Method: %r, Url: %r' %
+                    (self.method, self._url),
+                )
+            self.path = self._build_path()
+
+    def _build_path(self) -> bytes:
+        if not self._url:
+            return b'/None'
+        url = self._url.path
+        if url == b'':
+            url = b'/'
+        if not self._url.query == b'':
+            url += b'?' + self._url.query
+        if not self._url.fragment == b'':
+            url += b'#' + self._url.fragment
+        return url
diff --git a/proxy/plugin/modify_chunk_response.py b/proxy/plugin/modify_chunk_response.py
@@ -45,7 +45,9 @@ def handle_upstream_chunk(self, chunk: memoryview) -> memoryview:
         self.response.parse(chunk.tobytes())
         # If response is complete, modify and dispatch to client
         if self.response.state == httpParserStates.COMPLETE:
-            self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
+            # Avoid setting a body for responses where content is not expected
+            if self.response.content_expected():
+                self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
             self.client.queue(memoryview(self.response.build_response()))
         return memoryview(b'')
 
diff --git a/tests/http/test_http_parser.py b/tests/http/test_http_parser.py