Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docs/content/notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ Input files
===========

A couple of things need to be kept in mind with regards to methylation data.
At this point, four different file types for methylation data are supported:
At this point, five different file types for methylation data are supported:
- Allcools files
- MethylDackel bedgraph files
- Bismark coverage files
- Bismark CpG report files
- BedMethyl files

Note that both MethylDackel bedgraph files and Bismark CpG report files are assumed to be 0-based encoded.
Note that MethylDackel bedgraph files, Bismark CpG report files and BedMethyl files are assumed to be 0-based start encoded (and 1-based end, if applicable).
The Allcools files and Bismark coverage files are assumed to be 1-based encoded.
For the Bismark coverage files, keep in mind that `bismark_methylation_extractor` has a flag to output 0-based files, so pay attention that this is correct.
For BedMethyl files, please note that _no_ checks are performed to ensure that only one modification type is present.
If you have multiple modification types (i.e. more then one 'name' in column 4), please split htem into separate files (you can include them in the output by specific multiple methylation_path/methylation_pattern combinations).

For the RNA-part, for now only featureCounts tables are supported.
47 changes: 47 additions & 0 deletions python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,53 @@ def bismarkrep_path(tmp_path):
f.write('chr1\t230\t+\t0\t100\tCG\tCGN\n')
return methpath


@pytest.fixture
def bedmethyl_path(tmp_path):
'''
simulate bedmethyl files.
tsv with chrom, start, end, name, score strand,
thickstart, thickend, color, validcov, %mod, count_mod
count_can, count_other, count_delete, count_fail, count_diff, count_nocall
columns.
these are 0-based.
'''
methpath = tmp_path / "bedmethyl"
methpath.mkdir()

# Cell 1 - avg. meth in gene 1 = 50%
with gzip.open(methpath / "cell1.WCGN.bed.gz", "wt") as f:
f.write('chr1\t20\t21\tm\t100\t+\t20\t21\t255,0,0\t100\t25.00\t25\t75\t0\t0\t0\t0\t0\n')
f.write('chr1\t50\t51\tm\t100\t+\t50\t51\t255,0,0\t100\t75.00\t75\t25\t0\t0\t0\t0\t0\n')

# Cell 2 - avg. meth in gene 2 = 62.5%
with gzip.open(methpath / "cell2.WCGN.bed.gz", "wt") as f:
f.write('chr1\t221\t222\tm\t4\t+\t221\t222\t255,0,0\t4\t50.00\t2\t2\t0\t0\t0\t0\t0\n')
f.write('chr1\t244\t245\tm\t16\t+\t224\t225\t255,0,0\t16\t75.00\t12\t4\t0\t0\t0\t0\t0\n')

# Cell 3 - avg. meth in gene 1 = 100%, avg. meth in gene 2 = 0%
with gzip.open(methpath / "cell3.WCGN.bed.gz", "wt") as f:
f.write('chr1\t20\t21\tm\t21\t+\t20\t21\t255,0,0\t21\t100.00\t21\t0\t0\t0\t0\t0\t0\n')
f.write('chr1\t50\t51\tm\t35\t+\t50\t51\t255,0,0\t35\t100.00\t35\t0\t0\t0\t0\t0\t0\n')
f.write('chr1\t221\t222\tm\t100\t+\t221\t222\t255,0,0\t100\t0.00\t0\t100\t0\t0\t0\t0\t0\n')

# Cell 1 - avg. meth in gene 1 = 25%
with gzip.open(methpath / "cell1.GCHN.bed.gz", "wt") as f:
f.write('chr1\t30\t31\tm\t100\t+\t30\t31\t255,0,0\t100\t25.00\t25\t75\t0\t0\t0\t0\t0\n')
f.write('chr1\t40\t41\tm\t4\t+\t40\t41\t255,0,0\t4\t25.00\t1\t3\t0\t0\t0\t0\t0\n')

# Cell 2 - avg. meth in gene 2 = 44%
with gzip.open(methpath / "cell2.GCHN.bed.gz", "wt") as f:
f.write('chr1\t230\t231\tm\t100\t+\t230\t231\t255,0,0\t100\t44.00\t44\t56\t0\t0\t0\t0\t0\n')

# Cell 3 - avg. meth in gene 1 = 100%, avg. meth in gene 2 = 0%
with gzip.open(methpath / "cell3.GCHN.bed.gz", "wt") as f:
f.write('chr1\t30\t31\tm\t100\t+\t30\t31\t255,0,0\t100\t100.00\t100\t0\t0\t0\t0\t0\t0\n')
f.write('chr1\t40\t41\tm\t4\t+\t40\t41\t255,0,0\t4\t100.00\t4\t0\t0\t0\t0\t0\t0\n')
f.write('chr1\t230\t231\tm\t100\t+\t230\t231\t255,0,0\t100\t0.00\t0\t100\t0\t0\t0\t0\t0\n')
return methpath


@pytest.fixture
def rna_path(tmp_path):
'''
Expand Down
1 change: 1 addition & 0 deletions python/tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class TestAllcools:
("methyldackel_path", ('*WCGN.bedgraph', '*GCHN.bedgraph')),
("bismarkcov_path", ('*WCGN.bismark.cov', '*GCHN.bismark.cov')),
("bismarkrep_path", ('*WCGN.bismark.rep.gz', '*GCHN.bismark.rep.gz')),
("bedmethyl_path", ('*WCGN*bed.gz', '*GCHN*bed.gz')),
],
indirect=["dynamic_methylation_path"]
)
Expand Down
13 changes: 10 additions & 3 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,6 @@ pub fn decide_methtype(line: Option<String>) -> MethFileType {
if let Some(l) = line {
let fields: Vec<&str> = l.split('\t').collect();
match fields.len() {
// 4 => {
// Ok(MethFileType::BismarkBedgraph)
// },
6 => {
// Either MethylDackel bedgraph or BismarkCov
// The difference is small:
Expand Down Expand Up @@ -202,6 +199,16 @@ pub fn decide_methtype(line: Option<String>) -> MethFileType {
MethFileType::BismarkCpGReport
}
},
18 => {
// BedMethyl file
let meth = fields[11].parse::<u32>();
let cov = fields[9].parse::<u32>();
if meth.is_ok() && cov.is_ok() {
MethFileType::BedMethyl
} else {
panic!("Could not parse BedMethyl coverage and modification counts (columns 9/11) from: {}", l);
}
}
_ => panic!("Could not decide methylation filetype, as it has {} columns.", fields.len())
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/tests/data/methf_allcools
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr1 1 + CG 1 1 1
chr1 3 - CG 0 1 1
chr1 3 - CG 0 1 1
3 changes: 3 additions & 0 deletions src/tests/data/methf_bedmethyl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#chrom chromStart chromEnd name score strand thickStart thickEnd color valid_coverage percent_modified count_modified count_canonical count_other_mod count_delete count_fail count_diff count_nocall
chr1 0 1 m 1 + 0 1 255,0,0 1 100.00 1 0 0 0 0 0 0
chr1 2 3 m 1 + 2 3 255,0,0 1 00.00 0 1 0 0 0 0 0
2 changes: 1 addition & 1 deletion src/tests/data/methf_bismarkcov
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr1 1 2 100 1 0
chr1 3 4 0 0 1
chr1 3 4 0 0 1
2 changes: 1 addition & 1 deletion src/tests/data/methf_cpgrep
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr1 0 + 1 0 CG CGH
chr1 2 - 0 1 CG CGH
chr1 2 - 0 1 CG CGH
2 changes: 1 addition & 1 deletion src/tests/data/methf_methyldackel
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chr1 0 1 100 1 1
chr1 2 3 0 0 1
chr1 2 3 0 0 1
2 changes: 1 addition & 1 deletion src/tests/data/region.bed
Original file line number Diff line number Diff line change
@@ -1 +1 @@
chr1 100 200
chr1 100 200
5 changes: 4 additions & 1 deletion src/tests/test_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ mod tests {
let bismarkcov = test_path.parent().unwrap().join("data/methf_bismarkcov");
let cpgrep = test_path.parent().unwrap().join("data/methf_cpgrep");
let methyldackel = test_path.parent().unwrap().join("data/methf_methyldackel");
let bedmethyl = test_path.parent().unwrap().join("data/methf_bedmethyl");
let pairs: Vec<(&str, MethFileType)> = vec![
(allcoolsf.to_str().unwrap(), MethFileType::AllCools),
(bismarkcov.to_str().unwrap(), MethFileType::BismarkCov),
(cpgrep.to_str().unwrap(), MethFileType::BismarkCpGReport),
(methyldackel.to_str().unwrap(), MethFileType::MethylDackel),
(bedmethyl.to_str().unwrap(), MethFileType::BedMethyl),
];
for (f, expected) in pairs {
let reader = std::io::BufReader::new(std::fs::File::open(f).unwrap());
Expand All @@ -74,7 +76,7 @@ mod tests {
let bismarkcov = test_path.parent().unwrap().join("data/methf_bismarkcov");
let cpgrep = test_path.parent().unwrap().join("data/methf_cpgrep");
let methyldackel = test_path.parent().unwrap().join("data/methf_methyldackel");

let bedmethyl = test_path.parent().unwrap().join("data/methf_bedmethyl");
let exp_mr = vec![
MethRegion { chrom: "chr1".to_string(), pos: 0, meth: 1, total: 1 },
MethRegion { chrom: "chr1".to_string(), pos: 2, meth: 0, total: 1 },
Expand All @@ -85,6 +87,7 @@ mod tests {
bismarkcov.to_str().unwrap(),
cpgrep.to_str().unwrap(),
methyldackel.to_str().unwrap(),
bedmethyl.to_str().unwrap(),
] {
let methregions = read_meth(f);
assert_eq!(methregions, exp_mr);
Expand Down
18 changes: 17 additions & 1 deletion src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,17 @@ pub struct MethRegion {
pub enum MethFileType {
AllCools,
MethylDackel,
//BismarkBedgraph,
BedMethyl,
BismarkCov,
BismarkCpGReport,
}

impl MethFileType {
pub fn parse_line(&self, line: &str) -> Result<Option<MethRegion>, String> {
// Ignore any comment lines (#)
if line.starts_with('#') {
return Ok(None);
};
match self {
MethFileType::AllCools => {
let fields: Vec<&str> = line.split('\t').collect();
Expand Down Expand Up @@ -81,6 +85,18 @@ impl MethFileType {
Ok(None)
}
}
MethFileType::BedMethyl => {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() != 18 {
return Err(format!("Invalid BedMethyl line: {}", line));
}
let chrom = fields[0].to_string();
// BedMethyl is 0-based.
let pos = fields[1].parse::<u32>().map_err(|e| format!("Invalid position in BedMethyl line: {}: {}", line, e))?;
let meth = fields[11].parse::<u32>().map_err(|e| format!("Invalid methylated count in BedMethyl line: {}: {}", line, e))?;
let total = fields[9].parse::<u32>().map_err(|e| format!("Invalid unmethylated count in BedMethyl line: {}: {}", line, e))?;
Ok(Some(MethRegion { chrom, pos, meth, total }))
}
}
}
}
Loading