Skip to content

Commit 8b76357

Browse files
chore(deps): bump nltk from 3.8.1 to 3.9.1; fix unit tests for dependabot/pip/nltk-3.9 branch (#51)
1 parent 4cdb963 commit 8b76357

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist
1212
.mypy_cache
1313
.venv
1414
.pytest_cache
15+
.idea
1516
**/__pycache__

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,19 @@
3535
FileType,
3636
detect_filetype,
3737
)
38+
import nltk
3839

3940
unstructured_partition_pdf = None
4041
unstructured_partition_docx = None
4142
unstructured_partition_pptx = None
4243

44+
try:
45+
nltk.data.find("tokenizers/punkt.zip")
46+
nltk.data.find("tokenizers/punkt_tab.zip")
47+
except LookupError:
48+
nltk.download("punkt")
49+
nltk.download("punkt_tab")
50+
4351

4452
def optional_decode(contents: Union[str, bytes]) -> str:
4553
if isinstance(contents, bytes):
@@ -162,6 +170,10 @@ def parse_records(
162170
logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
163171
else:
164172
raise e
173+
except Exception as e:
174+
exception_str = str(e)
175+
logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
176+
raise e
165177

166178
def _read_file(
167179
self,
@@ -186,7 +198,7 @@ def _read_file(
186198
remote_file,
187199
self._get_file_type_error_message(filetype),
188200
)
189-
if filetype in {FileType.MD, filetype is FileType.TXT}:
201+
if filetype in {FileType.MD, FileType.TXT}:
190202
file_content: bytes = file_handle.read()
191203
decoded_content: str = optional_decode(file_content)
192204
return decoded_content
@@ -418,7 +430,12 @@ def _render_markdown(self, elements: List[Any]) -> str:
418430

419431
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
420432
if dpath.get(el, "type") == "Title":
421-
heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
433+
category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
434+
if not isinstance(category_depth, int):
435+
category_depth = (
436+
int(category_depth) if isinstance(category_depth, (str, float)) else 1
437+
)
438+
heading_str = "#" * category_depth
422439
return f"{heading_str} {dpath.get(el, 'text')}"
423440
elif dpath.get(el, "type") == "ListItem":
424441
return f"- {dpath.get(el, 'text')}"

0 commit comments

Comments
 (0)