
Commit 68ffc5a

inject study_type in EBI and improvements to current automatic processing pipeline (#3023)
* inject study_type in EBI and improvements to current automatic processing pipeline
* addressing @ElDeveloper's comments
1 parent d9275b7 commit 68ffc5a

3 files changed: +61 -22 lines changed


qiita_ware/ebi.py

Lines changed: 5 additions & 0 deletions
@@ -356,6 +356,11 @@ def generate_study_xml(self):
         study_title = ET.SubElement(descriptor, 'STUDY_TITLE')
         study_title.text = escape(clean_whitespace(self.study_title))

+        # study type is deprecated and not displayed anywhere on EBI-ENA;
+        # however it's required for submission so just injecting with Other
+        ET.SubElement(
+            descriptor, 'STUDY_TYPE', {'existing_study_type': 'Other'})
+
         study_abstract = ET.SubElement(descriptor, 'STUDY_ABSTRACT')
         study_abstract.text = clean_whitespace(escape(self.study_abstract))
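For reference, a minimal standalone sketch of how that ElementTree call serializes (the DESCRIPTOR parent below is illustrative; in generate_study_xml it is the study's descriptor element):

import xml.etree.ElementTree as ET

# Illustrative parent element; in generate_study_xml this is the study descriptor.
descriptor = ET.Element('DESCRIPTOR')
ET.SubElement(descriptor, 'STUDY_TYPE', {'existing_study_type': 'Other'})

# Prints: <DESCRIPTOR><STUDY_TYPE existing_study_type="Other" /></DESCRIPTOR>
print(ET.tostring(descriptor, encoding='unicode'))

The serialized element matches the expected XML asserted in the test change below.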

qiita_ware/test/test_ebi.py

Lines changed: 1 addition & 0 deletions
@@ -1314,6 +1314,7 @@ def test_parse_EBI_reply(self):
         <STUDY_TITLE>
             Identification of the Microbiomes for Cannabis Soils
         </STUDY_TITLE>
+        <STUDY_TYPE existing_study_type="Other" />
         <STUDY_ABSTRACT>
             This is a preliminary study to examine the microbiota associated with \
             the Cannabis plant. Soils samples from the bulk soil, soil associated with \

scripts/qiita-auto-processing

Lines changed: 55 additions & 22 deletions
@@ -33,6 +33,7 @@ user = User('[email protected]')
 # 'version': the version of the plugin,
 # 'cmd_name': the command we want to run,
 # 'input_name': the name of the input parameter of that command
+# 'ignore_parameters': list of parameters to ignore, for example: threads
 # 'parent_artifact_name': name of the parent output, input for this command
 # 'parameters_names': list of the names of the parameter sets we want to run
 # }
@@ -41,21 +42,24 @@ full_pipelines = [
      'data_type': ['Metagenomic'],
      'artifact_type': 'per_sample_FASTQ',
      'previous-step': None,
+     'requirements': dict(),
      'steps': [
          {'previous-step': None,
           'plugin': 'qp-shogun',
           'version': '012020',
           'cmd_name': 'Atropos v1.1.24',
           'input_name': 'input',
+          'ignore_parameters': ['Number of threads used'],
           'parent_artifact_name': None,
           'parameters_names': ['KAPA HyperPlus with iTru']},
          {'previous-step': 'Atropos v1.1.24',
           'plugin': 'qp-shogun',
-          'version': '012020',
-          'cmd_name': 'Shogun v1.0.7',
+          'version': '072020',
+          'cmd_name': 'Shogun v1.0.8',
           'input_name': 'input',
+          'ignore_parameters': ['Number of threads'],
           'parent_artifact_name': 'Adapter trimmed files',
-          'parameters_names': ['wol_bowtie2', 'rep94_bowtie2']}
+          'parameters_names': ['wol_bowtie2', 'rep200_bowtie2']}
      ]},
     {'name': 'Target Gene Processing',
      'data_type': ['16S', '18S', 'ITS'],
@@ -73,6 +77,7 @@ full_pipelines = [
           'version': '1.9.1',
           'cmd_name': 'Trimming',
           'input_name': 'input_data',
+          'ignore_parameters': [],
           'parent_artifact_name': None,
           'parameters_names': ['90 base pairs',
                                '100 base pairs',
@@ -83,13 +88,15 @@ full_pipelines = [
           'version': '1.9.1',
           'cmd_name': 'Pick closed-reference OTUs',
           'input_name': 'input_data',
+          'ignore_parameters': [],
           'parent_artifact_name': 'Trimmed Demultiplexed',
           'parameters_names': ['Defaults - parallel']},
          {'previous-step': 'Trimming',
           'plugin': 'deblur',
           'version': '1.1.0',
           'cmd_name': 'Deblur',
           'input_name': 'Demultiplexed sequences',
+          'ignore_parameters': [],
           'parent_artifact_name': 'Trimmed Demultiplexed',
           'parameters_names': ['Defaults']}
      ]},
@@ -122,6 +129,22 @@ def _check_requirements(requirements, template):
     return satisfied


+def _check_parameters(jobs, cmd):
+    params = [{k: str(v) for k, v in j.parameters.values.items()
+               if k not in cmd['ignore_parameters']} for j in jobs]
+    return params
+
+
+def _submit_workflows(artifact_process):
+    for artifact in artifact_process:
+        if artifact['workflow'] is None:
+            continue
+        # nodes will return in position [0] the first job created
+        first_job = list(artifact['workflow'].graph.nodes())[0]
+        if first_job.status == 'in_construction':
+            artifact['workflow'].submit()
+
+
 # Step 1. Loop over the full_pipelines to process each step
 for pipeline in full_pipelines:
     # Step 2. From the steps generate the list of commands to add to the
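As a rough, self-contained illustration of what the new _check_parameters helper buys (the job objects below are hypothetical stand-ins; in the script they come from artifact.jobs()): every value is stringified and the keys listed in ignore_parameters are dropped, so two runs that differ only in thread count normalize to the same parameter dict.

from types import SimpleNamespace

def _check_parameters(jobs, cmd):
    params = [{k: str(v) for k, v in j.parameters.values.items()
               if k not in cmd['ignore_parameters']} for j in jobs]
    return params

# Hypothetical stand-ins for Qiita job objects: j.parameters.values is a dict.
def fake_job(values):
    return SimpleNamespace(parameters=SimpleNamespace(values=values))

cmd = {'ignore_parameters': ['Number of threads']}
jobs = [fake_job({'input': 100, 'Number of threads': 16, 'reference': 'wol'}),
        fake_job({'input': 100, 'Number of threads': 5, 'reference': 'wol'})]

# Both jobs collapse to the same dict once the thread count is ignored.
print(_check_parameters(jobs, cmd))
# [{'input': '100', 'reference': 'wol'}, {'input': '100', 'reference': 'wol'}]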
@@ -149,6 +172,7 @@ for pipeline in full_pipelines:
                          'previous-step': step['previous-step'],
                          'parent_artifact_name': step['parent_artifact_name'],
                          'input_name': step['input_name'],
+                         'ignore_parameters': step['ignore_parameters'],
                          'parameters': parameters})

     # Step 2. - for children. Get their commands. We currently only support
@@ -161,7 +185,9 @@ for pipeline in full_pipelines:
                      if c['previous-step'] == commands[0]['command-name']]

     # Step 3. Find all preparations/artifacts that we can add the pipeline
-    artifacts_all = [a for study in Study.iter()
+    # ... as a first pass we will only process study 10317 (AGP) ...
+    # artifacts_all = [a for study in Study.iter()
+    artifacts_all = [a for study in [Study(10317)]
                      # loop over all artifacts of artifact_type with in study
                      for a in study.artifacts(
                          artifact_type=pipeline['artifact_type'])
@@ -172,7 +198,10 @@ for pipeline in full_pipelines:
     artifacts_compliant = []
     for a in artifacts_all:
         st = a.study.sample_template
-        pt = a.prep_templates[0]
+        pts = a.prep_templates
+        if not pts:
+            continue
+        pt = pts[0]

         # {'sandbox', 'awaiting_approval', 'private', 'public'}
         if a.visibility in ('sandbox', 'awaiting_approval'):
@@ -194,23 +223,29 @@ for pipeline in full_pipelines:
     # of Step 4 but for debugging it makes sense to separate
     artifact_process = []
     children_compliant = []
+    cmd = commands[0]
     for a in artifacts_compliant:
-        cmd = commands[0]
         # getting all jobs, includen hiddens, in case the job failed
         jobs = a.jobs(cmd=cmd['command'], show_hidden=True)
-        params = [j.parameters.values for j in jobs]
+        params = _check_parameters(jobs, cmd)

         # checking that all required parameters of this command exist
         missing_parameters = []
         for p in cmd['parameters']:
             p = p['values']
             p.update({cmd['input_name']: str(a.id)})
-            if p not in params:
+            p_to_compare = p.copy()
+            for k in cmd['ignore_parameters']:
+                del p_to_compare[k]
+            if p_to_compare not in params:
                 missing_parameters.append(p)
             else:
                 for c in a.children:
-                    if c.processing_parameters.values == p:
-                        children_compliant.append(c)
+                    cpp = c.processing_parameters
+                    if cpp.command.name == cmd['command-name']:
+                        cparams = _check_parameters([cpp], cmd)
+                        if cparams == p_to_compare:
+                            children_compliant.append(c)
         if missing_parameters:
             # note that we are building a dict for each artifact so we can
             # save the workflow id, useful for when we run this in a terminal
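To make the comparison above concrete, here is a small sketch of the membership test (the dictionaries are illustrative; in the script p comes from cmd['parameters'] and params from _check_parameters). Only the stripped copy is compared, while the full p, including the ignored keys and the input id, is what gets queued if no matching job exists.

# Requested parameter set for an artifact, as built in the loop above.
p = {'input': '42', 'Number of threads': '5', 'reference': 'wol'}
ignore_parameters = ['Number of threads']

# Parameters of already-run jobs, normalized by _check_parameters.
params = [{'input': '42', 'reference': 'wol'}]

p_to_compare = p.copy()
for k in ignore_parameters:
    del p_to_compare[k]

missing_parameters = []
if p_to_compare not in params:
    missing_parameters.append(p)

# Empty: an equivalent job already exists, so nothing new needs to be queued.
print(missing_parameters)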
@@ -224,14 +259,18 @@ for pipeline in full_pipelines:
         for cmd_id, cmd in enumerate(children_cmds):
             # getting all jobs, includen hiddens, in case the job failed
             jobs = a.jobs(cmd=cmd['command'], show_hidden=True)
-            params = [j.parameters.values for j in jobs]
+            params = _check_parameters(jobs, cmd)

             # checking that all required parameters of this command exist
             missing_parameters = []
             for p in cmd['parameters']:
                 p = p['values']
-                p.update({cmd['input_name']: str(c.id)})
-                if p not in params:
+                p.update({cmd['input_name']: str(a.id)})
+                p_to_compare = p.copy()
+                for k in cmd['ignore_parameters']:
+                    del p_to_compare[k]
+
+                if p_to_compare not in params:
                     missing_parameters.append(p)
             if missing_parameters:
                 artifact_process.append(
@@ -266,9 +305,9 @@ for pipeline in full_pipelines:
         # now we can add the rest of the parameters to the workflow for
         # the first command
         for params in artifact['missing_parameters'][1:]:
-            params.update({cmd['input_name']: str(a.id)})
             job_params = Parameters.load(cmd['command'], values_dict=params)
-            artifact['workflow'].add(job_params)
+            artifact['workflow'].add(
+                job_params, req_params={cmd['input_name']: str(a.id)})

         for cmd in commands[cmd_id + 1:]:
             # get jobs from the workflow to which we can add this new command
@@ -286,10 +325,4 @@ for pipeline in full_pipelines:
                         cmd['parent_artifact_name']: cmd['input_name']}})

     # Step 7. submit the workflows!
-    for artifact in artifact_process:
-        if artifact['workflow'] is None:
-            continue
-        # nodes will return in position [0] the first job created
-        first_job = list(artifact['workflow'].graph.nodes())[0]
-        if first_job.status == 'in_construction':
-            artifact['workflow'].submit()
+    _submit_workflows(artifact_process)
