35
35
FileType ,
36
36
detect_filetype ,
37
37
)
38
+ import nltk
38
39
39
40
# Module-level placeholders, initialised to None at import time.
# NOTE(review): presumably rebound elsewhere in this file to the real
# `unstructured` partition callables via a deferred import — confirm.
unstructured_partition_pdf = None
40
41
unstructured_partition_docx = None
41
42
unstructured_partition_pptx = None
42
43
44
# Ensure the NLTK "punkt" and "punkt_tab" tokenizer packages are available
# locally. Each package is looked up on its own, so one that is already
# installed is not requested again just because the other one is missing.
for _nltk_package in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_package}.zip")
    except LookupError:
        # The lookup failed for this package only: fetch just this one.
        nltk.download(_nltk_package)
50
+
43
51
44
52
def optional_decode (contents : Union [str , bytes ]) -> str :
45
53
if isinstance (contents , bytes ):
@@ -162,6 +170,10 @@ def parse_records(
162
170
logger .warn (f"File { file .uri } cannot be parsed. Skipping it." )
163
171
else :
164
172
raise e
173
+ except Exception as e :
174
+ exception_str = str (e )
175
+ logger .error (f"File { file .uri } caused an error during parsing: { exception_str } ." )
176
+ raise e
165
177
166
178
def _read_file (
167
179
self ,
@@ -186,7 +198,7 @@ def _read_file(
186
198
remote_file ,
187
199
self ._get_file_type_error_message (filetype ),
188
200
)
189
- if filetype in {FileType .MD , filetype is FileType .TXT }:
201
+ if filetype in {FileType .MD , FileType .TXT }:
190
202
file_content : bytes = file_handle .read ()
191
203
decoded_content : str = optional_decode (file_content )
192
204
return decoded_content
@@ -418,7 +430,12 @@ def _render_markdown(self, elements: List[Any]) -> str:
418
430
419
431
def _convert_to_markdown (self , el : Dict [str , Any ]) -> str :
420
432
if dpath .get (el , "type" ) == "Title" :
421
- heading_str = "#" * (dpath .get (el , "metadata/category_depth" , default = 1 ) or 1 )
433
+ category_depth = dpath .get (el , "metadata/category_depth" , default = 1 ) or 1
434
+ if not isinstance (category_depth , int ):
435
+ category_depth = (
436
+ int (category_depth ) if isinstance (category_depth , (str , float )) else 1
437
+ )
438
+ heading_str = "#" * category_depth
422
439
return f"{ heading_str } { dpath .get (el , 'text' )} "
423
440
elif dpath .get (el , "type" ) == "ListItem" :
424
441
return f"- { dpath .get (el , 'text' )} "
0 commit comments