Skip to content

Commit d9200fd

Browse files
authored
Response bodies can be empty or missing + HttpParser refactoring (#688)
* Responses can have None body, remove assertions, update modify chunk plugin to not modify chunks for responses with no content * Address mypy warning after removing assertion * Reusable get_body_or_chunks * Order methods by public/private, mark private ones with _ prefix * HttpParser.url deprecation notice (renamed to _url). Add zero-copy todo
1 parent 9e9ca90 commit d9200fd

File tree

3 files changed

+184
-158
lines changed

3 files changed

+184
-158
lines changed

proxy/http/parser.py

Lines changed: 162 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -43,35 +43,59 @@
4343

4444

4545
class HttpParser:
46-
"""HTTP request/response parser."""
46+
"""HTTP request/response parser.
47+
48+
TODO: Make me zero-copy by using memoryview.
49+
Currently due to chunk/buffer handling we
50+
are not able to utilize memoryview
51+
efficiently.
52+
53+
For this to happen we must store `buffer`
54+
as List[memoryview] instead of raw bytes and
55+
update parser to work accordingly.
56+
"""
4757

4858
def __init__(self, parser_type: int) -> None:
4959
self.type: int = parser_type
5060
self.state: int = httpParserStates.INITIALIZED
5161

62+
self.host: Optional[bytes] = None
63+
self.port: Optional[int] = None
64+
self.path: Optional[bytes] = None
65+
self.method: Optional[bytes] = None
66+
self.code: Optional[bytes] = None
67+
self.reason: Optional[bytes] = None
68+
self.version: Optional[bytes] = None
69+
5270
# Total size of raw bytes passed for parsing
5371
self.total_size: int = 0
5472

5573
# Buffer to hold unprocessed bytes
5674
self.buffer: bytes = b''
5775

76+
# Keys are lower case header names
77+
# Values are 2-tuple containing original
78+
# header and it's value as received.
5879
self.headers: Dict[bytes, Tuple[bytes, bytes]] = {}
5980
self.body: Optional[bytes] = None
6081

61-
self.method: Optional[bytes] = None
62-
self.url: Optional[urlparse.SplitResultBytes] = None
63-
self.code: Optional[bytes] = None
64-
self.reason: Optional[bytes] = None
65-
self.version: Optional[bytes] = None
66-
6782
self.chunk_parser: Optional[ChunkParser] = None
6883

69-
# This cleans up developer APIs as Python urlparse.urlsplit behaves differently
70-
# for incoming proxy request and incoming web request. Web request is the one
71-
# which is broken.
72-
self.host: Optional[bytes] = None
73-
self.port: Optional[int] = None
74-
self.path: Optional[bytes] = None
84+
# TODO: Deprecate me, we don't need this in core.
85+
#
86+
# Deprecated since v2.4.0
87+
#
88+
# This is mostly for developers so that they can directly
89+
# utilize a url object, but is unnecessary as parser
90+
# provides all the necessary parsed information.
91+
#
92+
# But developers can utilize urlsplit or whatever
93+
# library they are using when necessary. This will certainly
94+
# give some performance boost as url parsing won't be needed
95+
# for every request/response object.
96+
#
97+
# (except query string and fragments)
98+
self._url: Optional[urlparse.SplitResultBytes] = None
7599

76100
@classmethod
77101
def request(cls: Type[T], raw: bytes) -> T:
@@ -116,157 +140,51 @@ def set_url(self, url: bytes) -> None:
116140
# with urlsplit, which expects a fully qualified url.
117141
if self.method == httpMethods.CONNECT:
118142
url = b'https://' + url
119-
self.url = urlparse.urlsplit(url)
120-
self.set_line_attributes()
121-
122-
def set_line_attributes(self) -> None:
123-
if self.type == httpParserTypes.REQUEST_PARSER:
124-
if self.method == httpMethods.CONNECT and self.url:
125-
self.host = self.url.hostname
126-
self.port = 443 if self.url.port is None else self.url.port
127-
elif self.url:
128-
self.host, self.port = self.url.hostname, self.url.port \
129-
if self.url.port else DEFAULT_HTTP_PORT
130-
else:
131-
raise KeyError(
132-
'Invalid request. Method: %r, Url: %r' %
133-
(self.method, self.url),
134-
)
135-
self.path = self.build_path()
143+
self._url = urlparse.urlsplit(url)
144+
self._set_line_attributes()
136145

137146
def is_chunked_encoded(self) -> bool:
138147
return b'transfer-encoding' in self.headers and \
139148
self.headers[b'transfer-encoding'][1].lower() == b'chunked'
140149

150+
def content_expected(self) -> bool:
151+
return b'content-length' in self.headers and int(self.header(b'content-length')) > 0
152+
141153
def body_expected(self) -> bool:
142-
return (
143-
b'content-length' in self.headers and
144-
int(self.header(b'content-length')) > 0
145-
) or \
146-
self.is_chunked_encoded()
154+
return self.content_expected() or self.is_chunked_encoded()
147155

148156
def parse(self, raw: bytes) -> None:
149157
"""Parses Http request out of raw bytes.
150158
151-
Check HttpParser state after parse has successfully returned."""
159+
Check for `HttpParser.state` after `parse` has successfully returned.
160+
"""
152161
self.total_size += len(raw)
153162
raw = self.buffer + raw
154-
self.buffer = b''
155-
156-
more = len(raw) > 0
163+
self.buffer, more = b'', len(raw) > 0
157164
while more and self.state != httpParserStates.COMPLETE:
158-
if self.state in (
159-
httpParserStates.HEADERS_COMPLETE,
160-
httpParserStates.RCVING_BODY,
161-
):
162-
if b'content-length' in self.headers:
163-
self.state = httpParserStates.RCVING_BODY
164-
if self.body is None:
165-
self.body = b''
166-
total_size = int(self.header(b'content-length'))
167-
received_size = len(self.body)
168-
self.body += raw[:total_size - received_size]
169-
if self.body and \
170-
len(self.body) == int(self.header(b'content-length')):
171-
self.state = httpParserStates.COMPLETE
172-
more, raw = len(raw) > 0, raw[total_size - received_size:]
173-
elif self.is_chunked_encoded():
174-
if not self.chunk_parser:
175-
self.chunk_parser = ChunkParser()
176-
raw = self.chunk_parser.parse(raw)
177-
if self.chunk_parser.state == chunkParserStates.COMPLETE:
178-
self.body = self.chunk_parser.body
179-
self.state = httpParserStates.COMPLETE
180-
more = False
181-
else:
182-
raise NotImplementedError(
183-
'Parser shouldn\'t have reached here. ' +
184-
'This can happen when content length header is missing but their is a body in the payload',
185-
)
186-
else:
187-
more, raw = self.process(raw)
165+
# gte with HEADERS_COMPLETE also encapsulated RCVING_BODY state
166+
more, raw = self._process_body(raw) \
167+
if self.state >= httpParserStates.HEADERS_COMPLETE else \
168+
self._process_line_and_headers(raw)
188169
self.buffer = raw
189170

190-
def process(self, raw: bytes) -> Tuple[bool, bytes]:
191-
"""Returns False when no CRLF could be found in received bytes."""
192-
line, raw = find_http_line(raw)
193-
if line is None:
194-
return False, raw
195-
196-
if self.state == httpParserStates.INITIALIZED:
197-
self.process_line(line)
198-
self.state = httpParserStates.LINE_RCVD
199-
elif self.state in (httpParserStates.LINE_RCVD, httpParserStates.RCVING_HEADERS):
200-
if self.state == httpParserStates.LINE_RCVD:
201-
# LINE_RCVD state is equivalent to RCVING_HEADERS
202-
self.state = httpParserStates.RCVING_HEADERS
203-
if line.strip() == b'': # Blank line received.
204-
self.state = httpParserStates.HEADERS_COMPLETE
205-
else:
206-
self.process_header(line)
207-
208-
# When server sends a response line without any header or body e.g.
209-
# HTTP/1.1 200 Connection established\r\n\r\n
210-
if self.state == httpParserStates.LINE_RCVD and \
211-
self.type == httpParserTypes.RESPONSE_PARSER and \
212-
raw == CRLF:
213-
self.state = httpParserStates.COMPLETE
214-
elif self.state == httpParserStates.HEADERS_COMPLETE and \
215-
not self.body_expected() and \
216-
raw == b'':
217-
self.state = httpParserStates.COMPLETE
218-
219-
return len(raw) > 0, raw
220-
221-
def process_line(self, raw: bytes) -> None:
222-
line = raw.split(WHITESPACE)
223-
if self.type == httpParserTypes.REQUEST_PARSER:
224-
self.method = line[0].upper()
225-
self.set_url(line[1])
226-
self.version = line[2]
227-
else:
228-
self.version = line[0]
229-
self.code = line[1]
230-
self.reason = WHITESPACE.join(line[2:])
231-
232-
def process_header(self, raw: bytes) -> None:
233-
parts = raw.split(COLON)
234-
key = parts[0].strip()
235-
value = COLON.join(parts[1:]).strip()
236-
self.add_headers([(key, value)])
237-
238-
def build_path(self) -> bytes:
239-
if not self.url:
240-
return b'/None'
241-
url = self.url.path
242-
if url == b'':
243-
url = b'/'
244-
if not self.url.query == b'':
245-
url += b'?' + self.url.query
246-
if not self.url.fragment == b'':
247-
url += b'#' + self.url.fragment
248-
return url
249-
250171
def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes:
251172
"""Rebuild the request object."""
252173
assert self.method and self.version and self.path and self.type == httpParserTypes.REQUEST_PARSER
253174
if disable_headers is None:
254175
disable_headers = DEFAULT_DISABLE_HEADERS
255-
body: Optional[bytes] = ChunkParser.to_chunks(self.body) \
256-
if self.is_chunked_encoded() and self.body else \
257-
self.body
176+
body: Optional[bytes] = self._get_body_or_chunks()
258177
path = self.path
259178
if for_proxy:
260-
assert self.url and self.host and self.port and self.path
179+
assert self._url and self.host and self.port and self.path
261180
path = (
262-
self.url.scheme +
181+
self._url.scheme +
263182
COLON + SLASH + SLASH +
264183
self.host +
265184
COLON +
266185
str(self.port).encode() +
267186
self.path
268187
) if self.method != httpMethods.CONNECT else (self.host + COLON + str(self.port).encode())
269-
270188
return build_http_request(
271189
self.method, path, self.version,
272190
headers={} if not self.headers else {
@@ -278,16 +196,15 @@ def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool =
278196

279197
def build_response(self) -> bytes:
280198
"""Rebuild the response object."""
281-
assert self.code and self.version and self.body and self.type == httpParserTypes.RESPONSE_PARSER
199+
assert self.code and self.version and self.type == httpParserTypes.RESPONSE_PARSER
282200
return build_http_response(
283201
status_code=int(self.code),
284202
protocol_version=self.version,
285203
reason=self.reason,
286204
headers={} if not self.headers else {
287205
self.headers[k][0]: self.headers[k][1] for k in self.headers
288206
},
289-
body=self.body if not self.is_chunked_encoded(
290-
) else ChunkParser.to_chunks(self.body),
207+
body=self._get_body_or_chunks(),
291208
)
292209

293210
def has_host(self) -> bool:
@@ -305,3 +222,110 @@ def is_connection_upgrade(self) -> bool:
305222
return self.version == HTTP_1_1 and \
306223
self.has_header(b'Connection') and \
307224
self.has_header(b'Upgrade')
225+
226+
def _process_body(self, raw: bytes) -> Tuple[bool, bytes]:
227+
if b'content-length' in self.headers:
228+
self.state = httpParserStates.RCVING_BODY
229+
if self.body is None:
230+
self.body = b''
231+
total_size = int(self.header(b'content-length'))
232+
received_size = len(self.body)
233+
self.body += raw[:total_size - received_size]
234+
if self.body and \
235+
len(self.body) == int(self.header(b'content-length')):
236+
self.state = httpParserStates.COMPLETE
237+
more, raw = len(raw) > 0, raw[total_size - received_size:]
238+
elif self.is_chunked_encoded():
239+
if not self.chunk_parser:
240+
self.chunk_parser = ChunkParser()
241+
raw = self.chunk_parser.parse(raw)
242+
if self.chunk_parser.state == chunkParserStates.COMPLETE:
243+
self.body = self.chunk_parser.body
244+
self.state = httpParserStates.COMPLETE
245+
more = False
246+
else:
247+
raise NotImplementedError(
248+
'Parser shouldn\'t have reached here. ' +
249+
'This can happen when content length header is missing but their is a body in the payload',
250+
)
251+
return more, raw
252+
253+
def _process_line_and_headers(self, raw: bytes) -> Tuple[bool, bytes]:
254+
"""Returns False when no CRLF could be found in received bytes."""
255+
line, raw = find_http_line(raw)
256+
if line is None:
257+
return False, raw
258+
259+
if self.state == httpParserStates.INITIALIZED:
260+
self._process_line(line)
261+
self.state = httpParserStates.LINE_RCVD
262+
elif self.state in (httpParserStates.LINE_RCVD, httpParserStates.RCVING_HEADERS):
263+
if self.state == httpParserStates.LINE_RCVD:
264+
# LINE_RCVD state is equivalent to RCVING_HEADERS
265+
self.state = httpParserStates.RCVING_HEADERS
266+
if line.strip() == b'': # Blank line received.
267+
self.state = httpParserStates.HEADERS_COMPLETE
268+
else:
269+
self._process_header(line)
270+
271+
# When server sends a response line without any header or body e.g.
272+
# HTTP/1.1 200 Connection established\r\n\r\n
273+
if self.state == httpParserStates.LINE_RCVD and \
274+
self.type == httpParserTypes.RESPONSE_PARSER and \
275+
raw == CRLF:
276+
self.state = httpParserStates.COMPLETE
277+
elif self.state == httpParserStates.HEADERS_COMPLETE and \
278+
not self.body_expected() and \
279+
raw == b'':
280+
self.state = httpParserStates.COMPLETE
281+
282+
return len(raw) > 0, raw
283+
284+
def _process_line(self, raw: bytes) -> None:
285+
line = raw.split(WHITESPACE)
286+
if self.type == httpParserTypes.REQUEST_PARSER:
287+
self.method = line[0].upper()
288+
self.set_url(line[1])
289+
self.version = line[2]
290+
else:
291+
self.version = line[0]
292+
self.code = line[1]
293+
self.reason = WHITESPACE.join(line[2:])
294+
295+
def _process_header(self, raw: bytes) -> None:
296+
parts = raw.split(COLON)
297+
key = parts[0].strip()
298+
value = COLON.join(parts[1:]).strip()
299+
self.add_headers([(key, value)])
300+
301+
def _get_body_or_chunks(self) -> Optional[bytes]:
302+
return ChunkParser.to_chunks(self.body) \
303+
if self.body and self.is_chunked_encoded() else \
304+
self.body
305+
306+
def _set_line_attributes(self) -> None:
307+
if self.type == httpParserTypes.REQUEST_PARSER:
308+
if self.method == httpMethods.CONNECT and self._url:
309+
self.host = self._url.hostname
310+
self.port = 443 if self._url.port is None else self._url.port
311+
elif self._url:
312+
self.host, self.port = self._url.hostname, self._url.port \
313+
if self._url.port else DEFAULT_HTTP_PORT
314+
else:
315+
raise KeyError(
316+
'Invalid request. Method: %r, Url: %r' %
317+
(self.method, self._url),
318+
)
319+
self.path = self._build_path()
320+
321+
def _build_path(self) -> bytes:
322+
if not self._url:
323+
return b'/None'
324+
url = self._url.path
325+
if url == b'':
326+
url = b'/'
327+
if not self._url.query == b'':
328+
url += b'?' + self._url.query
329+
if not self._url.fragment == b'':
330+
url += b'#' + self._url.fragment
331+
return url

proxy/plugin/modify_chunk_response.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ def handle_upstream_chunk(self, chunk: memoryview) -> memoryview:
4545
self.response.parse(chunk.tobytes())
4646
# If response is complete, modify and dispatch to client
4747
if self.response.state == httpParserStates.COMPLETE:
48-
self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
48+
# Avoid setting a body for responses where content is not expected
49+
if self.response.content_expected():
50+
self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
4951
self.client.queue(memoryview(self.response.build_response()))
5052
return memoryview(b'')
5153

0 commit comments

Comments
 (0)