Skip to content

Commit aa7178d

Browse files
Merge branch 'master' into feature/ivyleavedtoadflax/multitask_2
2 parents 2d56411 + fbc37d1 commit aa7178d

File tree

9 files changed

+1973
-514
lines changed

9 files changed

+1973
-514
lines changed

deep_reference_parser/io/io.py

+51-12
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,53 @@
1313

1414
from ..logger import logger
1515

16-
def _split_list_by_linebreaks(tokens):
16+
def _unpack(tuples):
17+
"""Convert list of tuples into the correct format:
18+
19+
From:
20+
21+
[
22+
(
23+
(token0, token1, token2, token3),
24+
(label0, label1, label2, label3),
25+
),
26+
(
27+
(token0, token1, token2),
28+
(label0, label1, label2),
29+
),
30+
)
31+
32+
to:
33+
]
34+
(
35+
(token0, token1, token2, token3),
36+
(token0, token1, token2),
37+
),
38+
(
39+
(label0, label1, label2, label3),
40+
(label0, label1, label2),
41+
),
42+
]
43+
"""
44+
return list(zip(*list(tuples)))
45+
46+
def _split_list_by_linebreaks(rows):
1747
"""Cycle through a list of tokens (or labels) and split them into lists
1848
based on the presence of Nones or more likely math.nan caused by converting
1949
pd.DataFrame columns to lists.
2050
"""
2151
out = []
22-
tokens_gen = iter(tokens)
52+
rows_gen = iter(rows)
2353
while True:
2454
try:
25-
token = next(tokens_gen)
26-
if isinstance(token, str) and token:
27-
out.append(token)
55+
row = next(rows_gen)
56+
token = row[0]
57+
# Check whether there are missing labels that have been converted
58+
# to float('nan')
59+
if isinstance(token, str) and any([not isinstance(label, str) for label in row]):
60+
pass
61+
elif isinstance(token, str) and token:
62+
out.append(row)
2863
else:
2964
yield out
3065
out = []
@@ -40,10 +75,8 @@ def load_tsv(filepath, split_char="\t"):
4075
Expects data in the following format (tab separations).
4176
4277
References o o
43-
o o
4478
1 o o
4579
. o o
46-
o o
4780
WHO title b-r
4881
treatment title i-r
4982
guidelines title i-r
@@ -55,21 +88,27 @@ def load_tsv(filepath, split_char="\t"):
5588
, title i-r
5689
2016 title i-r
5790
58-
59-
6091
Args:
6192
filepath (str): Path to the data.
6293
split_char(str): Character to be used to split each line of the
6394
document.
6495
6596
Returns:
66-
a series of lists depending on the number of label columns provided in
97+
a series of lists depending on the number of label columns provided in
6798
filepath.
6899
69100
"""
70-
71101
df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
72-
out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
102+
103+
tuples = _split_list_by_linebreaks(df.to_records(index=False))
104+
105+
# Remove leading empty lists if found
106+
107+
tuples = list(filter(None, tuples))
108+
109+
unpacked_tuples = list(map(_unpack, tuples))
110+
111+
out = _unpack(unpacked_tuples)
73112

74113
logger.info("Loaded %s training examples", len(out[0]))
75114

0 commit comments

Comments
 (0)