@@ -13,11 +13,12 @@ class FolderLoader(BaseLoader):
13
13
"""
14
14
Load data from each file inside a folder
15
15
"""
16
- def __init__ (self , folder , test = None , max_depth = 0 , classmap = None , target_column = None , ** kwargs ):
16
+ def __init__ (self , folder , test = r'\.(txt|csv|tsv|TXT|CSV|TSV)$' , ignore = None , max_depth = 0 , classmap = None , target_column = None , ** kwargs ):
17
17
"""
18
18
Constructor
19
19
:param folder: str path to the folder to load files from
20
- :param test: str|callable filter for filenames (regex or callable)
20
+ :param test: str|callable only load files that pass the test (regex or callable)
21
+ :param ignore: str|callable ignore files that pass the test (regex or callable)
21
22
:param target_column: None|int|str if None, target is inferred from filename
22
23
:param max_depth: int how many levels of the folder tree to traverse
23
24
:param classmap: dict a mapping from class indices to class names
@@ -26,7 +27,7 @@ def __init__(self, folder, test=None, max_depth=0, classmap=None, target_column=
26
27
assert os .path .isdir (folder ), '%s MUST be a folder' % folder
27
28
28
29
self .root = os .path .abspath (folder )
29
- self .files = sorted (list (self .walk (self .root , max_depth , test = test )))
30
+ self .files = sorted (list (self .walk (self .root , max_depth , test = test , ignore = ignore )))
30
31
31
32
assert len (self .files ) > 0 , 'no file to read'
32
33
@@ -44,12 +45,13 @@ def __init__(self, folder, test=None, max_depth=0, classmap=None, target_column=
44
45
45
46
self .dataset = Dataset (X , y , columns = datasets [0 ].columns , classmap = classmap )
46
47
47
- def walk (self , folder , max_depth = - 1 , test = None , root = None ):
48
+ def walk (self , folder , max_depth = - 1 , test = None , ignore = None , root = None ):
48
49
"""
49
50
Recursively walk a directory
50
51
:param folder: str
51
52
:param max_depth: int
52
53
:param test: callable|regex
54
+ :param ignore: callable|regex
53
55
:param root: str
54
56
:return: Iterator
55
57
"""
@@ -58,29 +60,42 @@ def walk(self, folder, max_depth=-1, test=None, root=None):
58
60
if root is None :
59
61
root = folder
60
62
63
+ # make test a function
61
64
if isinstance (test , str ):
62
65
# filter is a regex
63
- regex = re .compile (test )
66
+ test_regex = re .compile (test )
64
67
65
68
def test (filename ):
66
- return regex .search (filename ) is not None
69
+ return test_regex .search (filename ) is not None
67
70
elif not callable (test ):
68
71
# dummy filter
69
72
def test (filename ):
70
73
return True
71
74
75
+ # make ignore a function
76
+ if isinstance (ignore , str ):
77
+ # ignore is a regex
78
+ ignore_regex = re .compile (ignore )
79
+
80
+ def ignore (filename ):
81
+ return ignore_regex .search (filename ) is not None
82
+ elif not callable (ignore ):
83
+ # dummy ignore
84
+ def ignore (filename ):
85
+ return False
86
+
72
87
for entry in scandir (folder ):
73
88
if entry .is_dir ():
74
89
queue .append (entry )
75
90
else :
76
- if test (entry .path [len (root ) + 1 :]):
91
+ if test (entry .path [len (root ) + 1 :]) and not ignore ( entry . path [ len ( root ) + 1 :]) :
77
92
yield entry .path
78
93
79
94
if max_depth == 0 :
80
95
return
81
96
82
97
for q in queue :
83
- for file in self .walk (q , max_depth = max_depth - 1 , test = test , root = root ):
98
+ for file in self .walk (q , max_depth = max_depth - 1 , test = test , ignore = ignore , root = root ):
84
99
yield file
85
100
86
101
def to_class_name (self , filename ):
0 commit comments