9
9
10
10
import backoff
11
11
import dpath
12
- from numpy import cast
13
12
import requests
14
13
from airbyte_cdk .models import FailureType
15
14
from airbyte_cdk .sources .file_based .config .file_based_stream_config import FileBasedStreamConfig
30
29
from airbyte_cdk .utils import is_cloud_environment
31
30
from airbyte_cdk .utils .traced_exception import AirbyteTracedException
32
31
from unstructured .file_utils .filetype import (
32
+ EXT_TO_FILETYPE ,
33
+ FILETYPE_TO_MIMETYPE ,
34
+ STR_TO_FILETYPE ,
33
35
FileType ,
34
36
detect_filetype ,
35
37
)
@@ -184,7 +186,6 @@ def _read_file(
184
186
remote_file ,
185
187
self ._get_file_type_error_message (filetype ),
186
188
)
187
- filetype = cast (FileType , filetype ) # for mypy
188
189
if filetype in {FileType .MD , filetype is FileType .TXT }:
189
190
file_content : bytes = file_handle .read ()
190
191
decoded_content : str = optional_decode (file_content )
@@ -298,7 +299,7 @@ def _read_file_remotely(
298
299
299
300
data = self ._params_to_dict (format .parameters , strategy )
300
301
301
- file_data = {"files" : ("filename" , file_handle , filetype . mime_type )}
302
+ file_data = {"files" : ("filename" , file_handle , FILETYPE_TO_MIMETYPE [ filetype ] )}
302
303
303
304
response = requests .post (
304
305
f"{ format .api_url } /general/v0/general" , headers = headers , data = data , files = file_data
@@ -368,8 +369,8 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
368
369
2. Use the file name if available
369
370
3. Use the file content
370
371
"""
371
- if remote_file .mime_type :
372
- return FileType . from_mime_type ( remote_file .mime_type )
372
+ if remote_file .mime_type and remote_file . mime_type in STR_TO_FILETYPE :
373
+ return STR_TO_FILETYPE [ remote_file .mime_type ]
373
374
374
375
# set name to none, otherwise unstructured will try to get the modified date from the local file system
375
376
if hasattr (file , "name" ):
@@ -381,21 +382,26 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
381
382
file_type : FileType | None = None
382
383
try :
383
384
file_type = detect_filetype (
384
- file_path = remote_file .uri ,
385
+ filename = remote_file .uri ,
385
386
)
386
387
except Exception :
387
388
# Path doesn't exist locally. Try something else...
388
389
pass
389
390
390
- if file_type is not None and not file_type = = FileType .UNK :
391
+ if file_type and file_type ! = FileType .UNK :
391
392
return file_type
392
393
393
394
type_based_on_content = detect_filetype (file = file )
395
+ file .seek (0 ) # detect_filetype is reading to read the file content, so we need to reset
394
396
395
- # detect_filetype is reading to read the file content
396
- file . seek ( 0 )
397
+ if type_based_on_content and type_based_on_content != FileType . UNK :
398
+ return type_based_on_content
397
399
398
- return type_based_on_content
400
+ extension = "." + remote_file .uri .split ("." )[- 1 ].lower ()
401
+ if extension in EXT_TO_FILETYPE :
402
+ return EXT_TO_FILETYPE [extension ]
403
+
404
+ return None
399
405
400
406
def _supported_file_types (self ) -> List [Any ]:
401
407
return [FileType .MD , FileType .PDF , FileType .DOCX , FileType .PPTX , FileType .TXT ]
@@ -412,9 +418,7 @@ def _render_markdown(self, elements: List[Any]) -> str:
412
418
413
419
def _convert_to_markdown (self , el : Dict [str , Any ]) -> str :
414
420
if dpath .get (el , "type" ) == "Title" :
415
- heading_str = "#" * int (
416
- dpath .get (el , "metadata/category_depth" , default = 1 ) or 1 ,
417
- )
421
+ heading_str = "#" * (dpath .get (el , "metadata/category_depth" , default = 1 ) or 1 )
418
422
return f"{ heading_str } { dpath .get (el , 'text' )} "
419
423
elif dpath .get (el , "type" ) == "ListItem" :
420
424
return f"- { dpath .get (el , 'text' )} "
0 commit comments