From 6343b99e2ee884bd4a284749f0cb3c31f0efb222 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 19 Sep 2025 13:14:04 +0200 Subject: [PATCH 1/3] support for bedmethyl files --- src/reader.rs | 13 ++++++++++--- src/tests/data/methf_allcools | 2 +- src/tests/data/methf_bedmethyl | 3 +++ src/tests/data/methf_bismarkcov | 2 +- src/tests/data/methf_cpgrep | 2 +- src/tests/data/methf_methyldackel | 2 +- src/tests/data/region.bed | 2 +- src/tests/test_reader.rs | 5 ++++- src/types.rs | 18 +++++++++++++++++- 9 files changed, 39 insertions(+), 10 deletions(-) create mode 100644 src/tests/data/methf_bedmethyl diff --git a/src/reader.rs b/src/reader.rs index 2c18c54..e4a04ef 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -167,9 +167,6 @@ pub fn decide_methtype(line: Option) -> MethFileType { if let Some(l) = line { let fields: Vec<&str> = l.split('\t').collect(); match fields.len() { - // 4 => { - // Ok(MethFileType::BismarkBedgraph) - // }, 6 => { // Either MethylDackel bedgraph or BismarkCov // The difference is small: @@ -202,6 +199,16 @@ pub fn decide_methtype(line: Option) -> MethFileType { MethFileType::BismarkCpGReport } }, + 18 => { + // BedMethyl file + let meth = fields[11].parse::(); + let cov = fields[9].parse::(); + if meth.is_ok() && cov.is_ok() { + MethFileType::BedMethyl + } else { + panic!("Could not parse BedMethyl coverage and modification counts (columns 9/11) from: {}", l); + } + } _ => panic!("Could not decide methylation filetype, as it has {} columns.", fields.len()) } } diff --git a/src/tests/data/methf_allcools b/src/tests/data/methf_allcools index 8bfd0af..c48ce82 100644 --- a/src/tests/data/methf_allcools +++ b/src/tests/data/methf_allcools @@ -1,2 +1,2 @@ chr1 1 + CG 1 1 1 -chr1 3 - CG 0 1 1 +chr1 3 - CG 0 1 1 \ No newline at end of file diff --git a/src/tests/data/methf_bedmethyl b/src/tests/data/methf_bedmethyl new file mode 100644 index 0000000..fac9d54 --- /dev/null +++ b/src/tests/data/methf_bedmethyl @@ -0,0 +1,3 @@ +#chrom chromStart chromEnd name score strand thickStart thickEnd color valid_coverage percent_modified count_modified count_canonical count_other_mod count_delete count_fail count_diff count_nocall +chr1 0 1 m 1 + 0 1 255,0,0 1 100.00 1 0 0 0 0 0 0 +chr1 2 3 m 1 + 2 3 255,0,0 1 00.00 0 1 0 0 0 0 0 diff --git a/src/tests/data/methf_bismarkcov b/src/tests/data/methf_bismarkcov index 6fe5e8d..3e28601 100644 --- a/src/tests/data/methf_bismarkcov +++ b/src/tests/data/methf_bismarkcov @@ -1,2 +1,2 @@ chr1 1 2 100 1 0 -chr1 3 4 0 0 1 +chr1 3 4 0 0 1 \ No newline at end of file diff --git a/src/tests/data/methf_cpgrep b/src/tests/data/methf_cpgrep index 14a4a7b..1c681b9 100644 --- a/src/tests/data/methf_cpgrep +++ b/src/tests/data/methf_cpgrep @@ -1,2 +1,2 @@ chr1 0 + 1 0 CG CGH -chr1 2 - 0 1 CG CGH +chr1 2 - 0 1 CG CGH \ No newline at end of file diff --git a/src/tests/data/methf_methyldackel b/src/tests/data/methf_methyldackel index f11d9f9..17a7a59 100644 --- a/src/tests/data/methf_methyldackel +++ b/src/tests/data/methf_methyldackel @@ -1,2 +1,2 @@ chr1 0 1 100 1 1 -chr1 2 3 0 0 1 +chr1 2 3 0 0 1 \ No newline at end of file diff --git a/src/tests/data/region.bed b/src/tests/data/region.bed index 92d666d..ac5c582 100644 --- a/src/tests/data/region.bed +++ b/src/tests/data/region.bed @@ -1 +1 @@ -chr1 100 200 +chr1 100 200 \ No newline at end of file diff --git a/src/tests/test_reader.rs b/src/tests/test_reader.rs index 115075c..b720891 100644 --- a/src/tests/test_reader.rs +++ b/src/tests/test_reader.rs @@ -51,11 +51,13 @@ mod tests { let bismarkcov = test_path.parent().unwrap().join("data/methf_bismarkcov"); let cpgrep = test_path.parent().unwrap().join("data/methf_cpgrep"); let methyldackel = test_path.parent().unwrap().join("data/methf_methyldackel"); + let bedmethyl = test_path.parent().unwrap().join("data/methf_bedmethyl"); let pairs: Vec<(&str, MethFileType)> = vec![ (allcoolsf.to_str().unwrap(), MethFileType::AllCools), (bismarkcov.to_str().unwrap(), MethFileType::BismarkCov), (cpgrep.to_str().unwrap(), MethFileType::BismarkCpGReport), (methyldackel.to_str().unwrap(), MethFileType::MethylDackel), + (bedmethyl.to_str().unwrap(), MethFileType::BedMethyl), ]; for (f, expected) in pairs { let reader = std::io::BufReader::new(std::fs::File::open(f).unwrap()); @@ -74,7 +76,7 @@ mod tests { let bismarkcov = test_path.parent().unwrap().join("data/methf_bismarkcov"); let cpgrep = test_path.parent().unwrap().join("data/methf_cpgrep"); let methyldackel = test_path.parent().unwrap().join("data/methf_methyldackel"); - + let bedmethyl = test_path.parent().unwrap().join("data/methf_bedmethyl"); let exp_mr = vec![ MethRegion { chrom: "chr1".to_string(), pos: 0, meth: 1, total: 1 }, MethRegion { chrom: "chr1".to_string(), pos: 2, meth: 0, total: 1 }, @@ -85,6 +87,7 @@ mod tests { bismarkcov.to_str().unwrap(), cpgrep.to_str().unwrap(), methyldackel.to_str().unwrap(), + bedmethyl.to_str().unwrap(), ] { let methregions = read_meth(f); assert_eq!(methregions, exp_mr); diff --git a/src/types.rs b/src/types.rs index fc5ca40..7529f2a 100644 --- a/src/types.rs +++ b/src/types.rs @@ -18,13 +18,17 @@ pub struct MethRegion { pub enum MethFileType { AllCools, MethylDackel, - //BismarkBedgraph, + BedMethyl, BismarkCov, BismarkCpGReport, } impl MethFileType { pub fn parse_line(&self, line: &str) -> Result, String> { + // Ignore any comment lines (#) + if line.starts_with('#') { + return Ok(None); + }; match self { MethFileType::AllCools => { let fields: Vec<&str> = line.split('\t').collect(); @@ -81,6 +85,18 @@ impl MethFileType { Ok(None) } } + MethFileType::BedMethyl => { + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() != 18 { + return Err(format!("Invalid BedMethyl line: {}", line)); + } + let chrom = fields[0].to_string(); + // BedMethyl is 0-based. + let pos = fields[1].parse::().map_err(|e| format!("Invalid position in BedMethyl line: {}: {}", line, e))?; + let meth = fields[11].parse::().map_err(|e| format!("Invalid methylated count in BedMethyl line: {}: {}", line, e))?; + let total = fields[9].parse::().map_err(|e| format!("Invalid unmethylated count in BedMethyl line: {}: {}", line, e))?; + Ok(Some(MethRegion { chrom, pos, meth, total })) + } } } } \ No newline at end of file From e3ab616480306931500a80e9a6abff5948639642 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 19 Sep 2025 13:39:41 +0200 Subject: [PATCH 2/3] include pytest for bedmethyl files --- python/tests/conftest.py | 47 ++++++++++++++++++++++++++++++++++++ python/tests/test_parsing.py | 1 + 2 files changed, 48 insertions(+) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 9c31acf..ca32f7d 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -200,6 +200,53 @@ def bismarkrep_path(tmp_path): f.write('chr1\t230\t+\t0\t100\tCG\tCGN\n') return methpath + +@pytest.fixture +def bedmethyl_path(tmp_path): + ''' + simulate bedmethyl files. + tsv with chrom, start, end, name, score strand, + thickstart, thickend, color, validcov, %mod, count_mod + count_can, count_other, count_delete, count_fail, count_diff, count_nocall + columns. + these are 0-based. + ''' + methpath = tmp_path / "bedmethyl" + methpath.mkdir() + + # Cell 1 - avg. meth in gene 1 = 50% + with gzip.open(methpath / "cell1.WCGN.bed.gz", "wt") as f: + f.write('chr1\t20\t21\tm\t100\t+\t20\t21\t255,0,0\t100\t25.00\t25\t75\t0\t0\t0\t0\t0\n') + f.write('chr1\t50\t51\tm\t100\t+\t50\t51\t255,0,0\t100\t75.00\t75\t25\t0\t0\t0\t0\t0\n') + + # Cell 2 - avg. meth in gene 2 = 62.5% + with gzip.open(methpath / "cell2.WCGN.bed.gz", "wt") as f: + f.write('chr1\t221\t222\tm\t4\t+\t221\t222\t255,0,0\t4\t50.00\t2\t2\t0\t0\t0\t0\t0\n') + f.write('chr1\t244\t245\tm\t16\t+\t224\t225\t255,0,0\t16\t75.00\t12\t4\t0\t0\t0\t0\t0\n') + + # Cell 3 - avg. meth in gene 1 = 100%, avg. meth in gene 2 = 0% + with gzip.open(methpath / "cell3.WCGN.bed.gz", "wt") as f: + f.write('chr1\t20\t21\tm\t21\t+\t20\t21\t255,0,0\t21\t100.00\t21\t0\t0\t0\t0\t0\t0\n') + f.write('chr1\t50\t51\tm\t35\t+\t50\t51\t255,0,0\t35\t100.00\t35\t0\t0\t0\t0\t0\t0\n') + f.write('chr1\t221\t222\tm\t100\t+\t221\t222\t255,0,0\t100\t0.00\t0\t100\t0\t0\t0\t0\t0\n') + + # Cell 1 - avg. meth in gene 1 = 25% + with gzip.open(methpath / "cell1.GCHN.bed.gz", "wt") as f: + f.write('chr1\t30\t31\tm\t100\t+\t30\t31\t255,0,0\t100\t25.00\t25\t75\t0\t0\t0\t0\t0\n') + f.write('chr1\t40\t41\tm\t4\t+\t40\t41\t255,0,0\t4\t25.00\t1\t3\t0\t0\t0\t0\t0\n') + + # Cell 2 - avg. meth in gene 2 = 44% + with gzip.open(methpath / "cell2.GCHN.bed.gz", "wt") as f: + f.write('chr1\t230\t231\tm\t100\t+\t230\t231\t255,0,0\t100\t44.00\t44\t56\t0\t0\t0\t0\t0\n') + + # Cell 3 - avg. meth in gene 1 = 100%, avg. meth in gene 2 = 0% + with gzip.open(methpath / "cell3.GCHN.bed.gz", "wt") as f: + f.write('chr1\t30\t31\tm\t100\t+\t30\t31\t255,0,0\t100\t100.00\t100\t0\t0\t0\t0\t0\t0\n') + f.write('chr1\t40\t41\tm\t4\t+\t40\t41\t255,0,0\t4\t100.00\t4\t0\t0\t0\t0\t0\t0\n') + f.write('chr1\t230\t231\tm\t100\t+\t230\t231\t255,0,0\t100\t0.00\t0\t100\t0\t0\t0\t0\t0\n') + return methpath + + @pytest.fixture def rna_path(tmp_path): ''' diff --git a/python/tests/test_parsing.py b/python/tests/test_parsing.py index 5200069..325a2d2 100644 --- a/python/tests/test_parsing.py +++ b/python/tests/test_parsing.py @@ -12,6 +12,7 @@ class TestAllcools: ("methyldackel_path", ('*WCGN.bedgraph', '*GCHN.bedgraph')), ("bismarkcov_path", ('*WCGN.bismark.cov', '*GCHN.bismark.cov')), ("bismarkrep_path", ('*WCGN.bismark.rep.gz', '*GCHN.bismark.rep.gz')), + ("bedmethyl_path", ('*WCGN*bed.gz', '*GCHN*bed.gz')), ], indirect=["dynamic_methylation_path"] ) From 4a9f84876f27c87dca200765b19e17363988e545 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 19 Sep 2025 13:43:20 +0200 Subject: [PATCH 3/3] docs update for bedmethyl --- docs/content/notes.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/content/notes.rst b/docs/content/notes.rst index fca0a7b..2101270 100644 --- a/docs/content/notes.rst +++ b/docs/content/notes.rst @@ -5,14 +5,17 @@ Input files =========== A couple of things need to be kept in mind with regards to methylation data. -At this point, four different file types for methylation data are supported: +At this point, five different file types for methylation data are supported: - Allcools files - MethylDackel bedgraph files - Bismark coverage files - Bismark CpG report files + - BedMethyl files -Note that both MethylDackel bedgraph files and Bismark CpG report files are assumed to be 0-based encoded. +Note that MethylDackel bedgraph files, Bismark CpG report files and BedMethyl files are assumed to be 0-based start encoded (and 1-based end, if applicable). The Allcools files and Bismark coverage files are assumed to be 1-based encoded. For the Bismark coverage files, keep in mind that `bismark_methylation_extractor` has a flag to output 0-based files, so pay attention that this is correct. +For BedMethyl files, please note that _no_ checks are performed to ensure that only one modification type is present. +If you have multiple modification types (i.e. more then one 'name' in column 4), please split htem into separate files (you can include them in the output by specific multiple methylation_path/methylation_pattern combinations). For the RNA-part, for now only featureCounts tables are supported. \ No newline at end of file