Skip to content

Commit 4cdb963

Browse files
committed
revert many changes
1 parent a431454 commit 4cdb963

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import backoff
1111
import dpath
12-
from numpy import cast
1312
import requests
1413
from airbyte_cdk.models import FailureType
1514
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
@@ -30,6 +29,9 @@
3029
from airbyte_cdk.utils import is_cloud_environment
3130
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
3231
from unstructured.file_utils.filetype import (
32+
EXT_TO_FILETYPE,
33+
FILETYPE_TO_MIMETYPE,
34+
STR_TO_FILETYPE,
3335
FileType,
3436
detect_filetype,
3537
)
@@ -184,7 +186,6 @@ def _read_file(
184186
remote_file,
185187
self._get_file_type_error_message(filetype),
186188
)
187-
filetype = cast(FileType, filetype) # for mypy
188189
if filetype in {FileType.MD, filetype is FileType.TXT}:
189190
file_content: bytes = file_handle.read()
190191
decoded_content: str = optional_decode(file_content)
@@ -298,7 +299,7 @@ def _read_file_remotely(
298299

299300
data = self._params_to_dict(format.parameters, strategy)
300301

301-
file_data = {"files": ("filename", file_handle, filetype.mime_type)}
302+
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
302303

303304
response = requests.post(
304305
f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
@@ -368,8 +369,8 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
368369
2. Use the file name if available
369370
3. Use the file content
370371
"""
371-
if remote_file.mime_type:
372-
return FileType.from_mime_type(remote_file.mime_type)
372+
if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
373+
return STR_TO_FILETYPE[remote_file.mime_type]
373374

374375
# set name to none, otherwise unstructured will try to get the modified date from the local file system
375376
if hasattr(file, "name"):
@@ -381,21 +382,26 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
381382
file_type: FileType | None = None
382383
try:
383384
file_type = detect_filetype(
384-
file_path=remote_file.uri,
385+
filename=remote_file.uri,
385386
)
386387
except Exception:
387388
# Path doesn't exist locally. Try something else...
388389
pass
389390

390-
if file_type is not None and not file_type == FileType.UNK:
391+
if file_type and file_type != FileType.UNK:
391392
return file_type
392393

393394
type_based_on_content = detect_filetype(file=file)
395+
file.seek(0) # detect_filetype reads the file content, so we need to reset the stream position
394396

395-
# detect_filetype is reading to read the file content
396-
file.seek(0)
397+
if type_based_on_content and type_based_on_content != FileType.UNK:
398+
return type_based_on_content
397399

398-
return type_based_on_content
400+
extension = "." + remote_file.uri.split(".")[-1].lower()
401+
if extension in EXT_TO_FILETYPE:
402+
return EXT_TO_FILETYPE[extension]
403+
404+
return None
399405

400406
def _supported_file_types(self) -> List[Any]:
401407
return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
@@ -412,9 +418,7 @@ def _render_markdown(self, elements: List[Any]) -> str:
412418

413419
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
414420
if dpath.get(el, "type") == "Title":
415-
heading_str = "#" * int(
416-
dpath.get(el, "metadata/category_depth", default=1) or 1,
417-
)
421+
heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
418422
return f"{heading_str} {dpath.get(el, 'text')}"
419423
elif dpath.get(el, "type") == "ListItem":
420424
return f"- {dpath.get(el, 'text')}"

0 commit comments

Comments
 (0)