from ..logger import logger

- def _split_list_by_linebreaks(tokens):
+ def _unpack(tuples):
+     """Convert a list of tuples into the correct format:
+
+     From:
+
+     [
+         (
+             (token0, token1, token2, token3),
+             (label0, label1, label2, label3),
+         ),
+         (
+             (token0, token1, token2),
+             (label0, label1, label2),
+         ),
+     ]
+
+     to:
+     [
+         (
+             (token0, token1, token2, token3),
+             (token0, token1, token2),
+         ),
+         (
+             (label0, label1, label2, label3),
+             (label0, label1, label2),
+         ),
+     ]
+     """
+     return list(zip(*list(tuples)))
+
+ def _split_list_by_linebreaks(rows):
      """Cycle through a list of tokens (or labels) and split them into lists
      based on the presence of Nones or more likely math.nan caused by converting
      pd.DataFrame columns to lists.
      """
      out = []
-     tokens_gen = iter(tokens)
+     rows_gen = iter(rows)
      while True:
          try:
-             token = next(tokens_gen)
-             if isinstance(token, str) and token:
-                 out.append(token)
+             row = next(rows_gen)
+             token = row[0]
+             # Check whether there are missing labels that have been converted
+             # to float('nan')
+             if isinstance(token, str) and any([not isinstance(label, str) for label in row]):
+                 pass
+             elif isinstance(token, str) and token:
+                 out.append(row)
              else:
                  yield out
                  out = []
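
A quick standalone sketch (invented TSV content, not part of this change) of why the boundary check above looks for non-string values: when the file is read with skip_blank_lines=False, each blank line arrives as a row of NaN floats, which _split_list_by_linebreaks then treats as the end of one training example.

import io
import pandas as pd

# Two labelled rows, a blank line, then another row; purely illustrative data.
tsv = "WHO\ttitle\tb-r\nguidelines\ttitle\ti-r\n\nReferences\to\to\n"
df = pd.read_csv(io.StringIO(tsv), delimiter="\t", header=None, skip_blank_lines=False)

print(list(df.to_records(index=False)))
# Roughly: [('WHO', 'title', 'b-r'), ('guidelines', 'title', 'i-r'),
#           (nan, nan, nan), ('References', 'o', 'o')]
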
@@ -40,10 +75,8 @@ def load_tsv(filepath, split_char="\t"):
      Expects data in the following format (tab separations).

      References   o       o
-                  o       o
      1            o       o
      .            o       o
-                  o       o
      WHO          title   b-r
      treatment    title   i-r
      guidelines   title   i-r
@@ -55,21 +88,27 @@ def load_tsv(filepath, split_char="\t"):
      ,            title   i-r
      2016         title   i-r

-
-
      Args:
          filepath (str): Path to the data.
          split_char(str): Character to be used to split each line of the
              document.

      Returns:
-         a series of lists depending on the number of label columns provided in
+         a series of lists depending on the number of label columns provided in
          filepath.

      """
-
      df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
-     out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
+
+     tuples = _split_list_by_linebreaks(df.to_records(index=False))
+
+     # Remove leading empty lists if found
+     tuples = list(filter(None, tuples))
+
+     unpacked_tuples = list(map(_unpack, tuples))
+
+     out = _unpack(unpacked_tuples)

      logger.info("Loaded %s training examples", len(out[0]))
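
To make the new load_tsv flow concrete, here is a toy trace of its last two steps (the groups are invented for illustration; _unpack is the same one-liner introduced above): each group of TSV rows is transposed, and then the groups themselves are transposed, so out[0] holds the token sequences and the remaining elements hold one label column each.

def _unpack(tuples):
    # Same transpose as above: zip(*) flips rows and columns.
    return list(zip(*list(tuples)))

# What _split_list_by_linebreaks might yield for a three-column TSV:
groups = [
    [("WHO", "title", "b-r"), ("guidelines", "title", "i-r")],
    [("References", "o", "o")],
]

unpacked_groups = list(map(_unpack, groups))
# [[('WHO', 'guidelines'), ('title', 'title'), ('b-r', 'i-r')],
#  [('References',), ('o',), ('o',)]]

out = _unpack(unpacked_groups)
print(len(out[0]))  # 2, the count reported by the logger call
print(out[0])       # (('WHO', 'guidelines'), ('References',))
print(out[2])       # (('b-r', 'i-r'), ('o',))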