|
38 | 38 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
39 | 39 | // OF THE POSSIBILITY OF SUCH DAMAGE.
|
40 | 40 |
|
41 |
| -// ignore-pretty very bad with line comments |
42 | 41 | // ignore-android doesn't terminate?
|
43 | 42 |
|
44 |
| -#![feature(slicing_syntax)] |
| 43 | +#![feature(slicing_syntax, asm, if_let, tuple_indexing)] |
45 | 44 |
|
46 |
| -use std::iter::range_step; |
47 |
| -use std::io::{stdin, stdout, File}; |
| 45 | +extern crate libc; |
48 | 46 |
|
49 |
| -static LINE_LEN: uint = 60; |
| 47 | +use std::io::stdio::{stdin_raw, stdout_raw}; |
| 48 | +use std::sync::{Future}; |
| 49 | +use std::num::{div_rem}; |
| 50 | +use std::ptr::{copy_memory}; |
| 51 | +use std::io::{IoResult, EndOfFile}; |
| 52 | +use std::slice::raw::{mut_buf_as_slice}; |
50 | 53 |
|
51 |
| -fn make_complements() -> [u8, ..256] { |
52 |
| - let transforms = [ |
53 |
| - ('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A'), |
54 |
| - ('U', 'A'), ('M', 'K'), ('R', 'Y'), ('W', 'W'), |
55 |
| - ('S', 'S'), ('Y', 'R'), ('K', 'M'), ('V', 'B'), |
56 |
| - ('H', 'D'), ('D', 'H'), ('B', 'V'), ('N', 'N'), |
57 |
| - ('\n', '\n')]; |
58 |
| - let mut complements: [u8, ..256] = [0, ..256]; |
59 |
| - for (i, c) in complements.iter_mut().enumerate() { |
60 |
| - *c = i as u8; |
| 54 | +use shared_memory::{SharedMemory}; |
| 55 | + |
| 56 | +mod tables { |
| 57 | + use std::sync::{Once, ONCE_INIT}; |
| 58 | + |
| 59 | + /// Lookup tables. |
| 60 | + static mut CPL16: [u16, ..1 << 16] = [0, ..1 << 16]; |
| 61 | + static mut CPL8: [u8, ..1 << 8] = [0, ..1 << 8]; |
| 62 | + |
| 63 | + /// Generates the tables. |
| 64 | + pub fn get() -> Tables { |
| 65 | + /// To make sure we initialize the tables only once. |
| 66 | + static INIT: Once = ONCE_INIT; |
| 67 | + INIT.doit(|| { |
| 68 | + unsafe { |
| 69 | + for i in range(0, 1 << 8) { |
| 70 | + CPL8[i] = match i as u8 { |
| 71 | + b'A' | b'a' => b'T', |
| 72 | + b'C' | b'c' => b'G', |
| 73 | + b'G' | b'g' => b'C', |
| 74 | + b'T' | b't' => b'A', |
| 75 | + b'U' | b'u' => b'A', |
| 76 | + b'M' | b'm' => b'K', |
| 77 | + b'R' | b'r' => b'Y', |
| 78 | + b'W' | b'w' => b'W', |
| 79 | + b'S' | b's' => b'S', |
| 80 | + b'Y' | b'y' => b'R', |
| 81 | + b'K' | b'k' => b'M', |
| 82 | + b'V' | b'v' => b'B', |
| 83 | + b'H' | b'h' => b'D', |
| 84 | + b'D' | b'd' => b'H', |
| 85 | + b'B' | b'b' => b'V', |
| 86 | + b'N' | b'n' => b'N', |
| 87 | + i => i, |
| 88 | + }; |
| 89 | + } |
| 90 | + |
| 91 | + for (i, v) in CPL16.iter_mut().enumerate() { |
| 92 | + *v = *CPL8.unsafe_get(i & 255) as u16 << 8 | |
| 93 | + *CPL8.unsafe_get(i >> 8) as u16; |
| 94 | + } |
| 95 | + } |
| 96 | + }); |
| 97 | + Tables { _dummy: () } |
61 | 98 | }
|
62 |
| - let lower = 'A' as u8 - 'a' as u8; |
63 |
| - for &(from, to) in transforms.iter() { |
64 |
| - complements[from as uint] = to as u8; |
65 |
| - complements[(from as u8 - lower) as uint] = to as u8; |
| 99 | + |
| 100 | + /// Accessor for the static arrays. |
| 101 | + /// |
| 102 | + /// To make sure that the tables can't be accessed without having been initialized. |
| 103 | + pub struct Tables { |
| 104 | + _dummy: () |
| 105 | + } |
| 106 | + |
| 107 | + impl Tables { |
| 108 | + /// Retreives the complement for `i`. |
| 109 | + pub fn cpl8(self, i: u8) -> u8 { |
| 110 | + // Not really unsafe. |
| 111 | + unsafe { CPL8[i as uint] } |
| 112 | + } |
| 113 | + |
| 114 | + /// Retreives the complement for `i`. |
| 115 | + pub fn cpl16(self, i: u16) -> u16 { |
| 116 | + unsafe { CPL16[i as uint] } |
| 117 | + } |
66 | 118 | }
|
67 |
| - complements |
68 | 119 | }
|
69 | 120 |
|
70 |
| -fn main() { |
71 |
| - let complements = make_complements(); |
72 |
| - let data = if std::os::getenv("RUST_BENCH").is_some() { |
73 |
| - File::open(&Path::new("shootout-k-nucleotide.data")).read_to_end() |
74 |
| - } else { |
75 |
| - stdin().read_to_end() |
76 |
| - }; |
77 |
| - let mut data = data.unwrap(); |
| 121 | +mod shared_memory { |
| 122 | + use std::sync::{Arc}; |
| 123 | + use std::mem::{transmute}; |
| 124 | + use std::raw::{Slice}; |
78 | 125 |
|
79 |
| - for seq in data.as_mut_slice().split_mut(|c| *c == '>' as u8) { |
80 |
| - // skip header and last \n |
81 |
| - let begin = match seq.iter().position(|c| *c == '\n' as u8) { |
82 |
| - None => continue, |
83 |
| - Some(c) => c |
84 |
| - }; |
85 |
| - let len = seq.len(); |
86 |
| - let seq = seq[mut begin+1..len-1]; |
87 |
| - |
88 |
| - // arrange line breaks |
89 |
| - let len = seq.len(); |
90 |
| - let off = LINE_LEN - len % (LINE_LEN + 1); |
91 |
| - for i in range_step(LINE_LEN, len, LINE_LEN + 1) { |
92 |
| - for j in std::iter::count(i, -1).take(off) { |
93 |
| - seq[j] = seq[j - 1]; |
| 126 | + /// Structure for sharing disjoint parts of a vector mutably across tasks. |
| 127 | + pub struct SharedMemory { |
| 128 | + ptr: Arc<Vec<u8>>, |
| 129 | + start: uint, |
| 130 | + len: uint, |
| 131 | + } |
| 132 | + |
| 133 | + impl SharedMemory { |
| 134 | + pub fn new(ptr: Vec<u8>) -> SharedMemory { |
| 135 | + let len = ptr.len(); |
| 136 | + SharedMemory { |
| 137 | + ptr: Arc::new(ptr), |
| 138 | + start: 0, |
| 139 | + len: len, |
94 | 140 | }
|
95 |
| - seq[i - off] = '\n' as u8; |
96 | 141 | }
|
97 | 142 |
|
98 |
| - // reverse complement, as |
99 |
| - // seq.reverse(); for c in seq.iter_mut() { *c = complements[*c] } |
100 |
| - // but faster: |
101 |
| - for (front, back) in two_side_iter(seq) { |
102 |
| - let tmp = complements[*front as uint]; |
103 |
| - *front = complements[*back as uint]; |
104 |
| - *back = tmp; |
| 143 | + pub fn as_mut_slice(&mut self) -> &mut [u8] { |
| 144 | + unsafe { |
| 145 | + transmute(Slice { |
| 146 | + data: self.ptr.as_ptr().offset(self.start as int) as *const u8, |
| 147 | + len: self.len, |
| 148 | + }) |
| 149 | + } |
105 | 150 | }
|
106 |
| - if seq.len() % 2 == 1 { |
107 |
| - let middle = &mut seq[seq.len() / 2]; |
108 |
| - *middle = complements[*middle as uint]; |
| 151 | + |
| 152 | + pub fn len(&self) -> uint { |
| 153 | + self.len |
109 | 154 | }
|
110 |
| - } |
111 | 155 |
|
112 |
| - stdout().write(data.as_slice()).unwrap(); |
| 156 | + pub fn split_at(self, mid: uint) -> (SharedMemory, SharedMemory) { |
| 157 | + assert!(mid <= self.len); |
| 158 | + let left = SharedMemory { |
| 159 | + ptr: self.ptr.clone(), |
| 160 | + start: self.start, |
| 161 | + len: mid, |
| 162 | + }; |
| 163 | + let right = SharedMemory { |
| 164 | + ptr: self.ptr, |
| 165 | + start: self.start + mid, |
| 166 | + len: self.len - mid, |
| 167 | + }; |
| 168 | + (left, right) |
| 169 | + } |
| 170 | + |
| 171 | + /// Resets the object so that it covers the whole range of the contained vector. |
| 172 | + /// |
| 173 | + /// You must not call this method if `self` is not the only reference to the |
| 174 | + /// shared memory. |
| 175 | + /// |
| 176 | + /// FIXME: If `Arc` had a method to check if the reference is unique, then we |
| 177 | + /// wouldn't need the `unsafe` here. |
| 178 | + /// |
| 179 | + /// FIXME: If `Arc` had a method to unwrap the contained value, then we could |
| 180 | + /// simply unwrap here. |
| 181 | + pub unsafe fn reset(self) -> SharedMemory { |
| 182 | + let len = self.ptr.len(); |
| 183 | + SharedMemory { |
| 184 | + ptr: self.ptr, |
| 185 | + start: 0, |
| 186 | + len: len, |
| 187 | + } |
| 188 | + } |
| 189 | + } |
113 | 190 | }
|
114 | 191 |
|
115 |
| -pub struct TwoSideIter<'a, T: 'a> { |
116 |
| - first: *mut T, |
117 |
| - last: *mut T, |
118 |
| - marker: std::kinds::marker::ContravariantLifetime<'a>, |
119 |
| - marker2: std::kinds::marker::NoCopy |
| 192 | + |
| 193 | +/// Reads all remaining bytes from the stream. |
| 194 | +fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> { |
| 195 | + const CHUNK: uint = 64 * 1024; |
| 196 | + |
| 197 | + let mut vec = Vec::with_capacity(1024 * 1024); |
| 198 | + loop { |
| 199 | + if vec.capacity() - vec.len() < CHUNK { |
| 200 | + let cap = vec.capacity(); |
| 201 | + let mult = if cap < 256 * 1024 * 1024 { |
| 202 | + // FIXME (mahkoh): Temporary workaround for jemalloc on linux. Replace |
| 203 | + // this by 2x once the jemalloc preformance issue has been fixed. |
| 204 | + 16 |
| 205 | + } else { |
| 206 | + 2 |
| 207 | + }; |
| 208 | + vec.reserve_exact(mult * cap); |
| 209 | + } |
| 210 | + unsafe { |
| 211 | + let ptr = vec.as_mut_ptr().offset(vec.len() as int); |
| 212 | + match mut_buf_as_slice(ptr, CHUNK, |s| r.read(s)) { |
| 213 | + Ok(n) => { |
| 214 | + let len = vec.len(); |
| 215 | + vec.set_len(len + n); |
| 216 | + }, |
| 217 | + Err(ref e) if e.kind == EndOfFile => break, |
| 218 | + Err(e) => return Err(e), |
| 219 | + } |
| 220 | + } |
| 221 | + } |
| 222 | + Ok(vec) |
120 | 223 | }
|
121 | 224 |
|
122 |
| -pub fn two_side_iter<'a, T>(slice: &'a mut [T]) -> TwoSideIter<'a, T> { |
123 |
| - let len = slice.len(); |
124 |
| - let first = slice.as_mut_ptr(); |
125 |
| - let last = if len == 0 { |
126 |
| - first |
127 |
| - } else { |
128 |
| - unsafe { first.offset(len as int - 1) } |
| 225 | +/// Finds the first position at which `b` occurs in `s`. |
| 226 | +fn memchr(h: &[u8], n: u8) -> Option<uint> { |
| 227 | + use libc::{c_void, c_int, size_t}; |
| 228 | + extern { |
| 229 | + fn memchr(h: *const c_void, n: c_int, s: size_t) -> *mut c_void; |
| 230 | + } |
| 231 | + let res = unsafe { |
| 232 | + memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t) |
129 | 233 | };
|
130 |
| - |
131 |
| - TwoSideIter { |
132 |
| - first: first, |
133 |
| - last: last, |
134 |
| - marker: std::kinds::marker::ContravariantLifetime, |
135 |
| - marker2: std::kinds::marker::NoCopy |
| 234 | + if res.is_null() { |
| 235 | + None |
| 236 | + } else { |
| 237 | + Some(res as uint - h.as_ptr() as uint) |
136 | 238 | }
|
137 | 239 | }
|
138 | 240 |
|
139 |
| -impl<'a, T> Iterator<(&'a mut T, &'a mut T)> for TwoSideIter<'a, T> { |
140 |
| - fn next(&mut self) -> Option<(&'a mut T, &'a mut T)> { |
141 |
| - if self.first < self.last { |
142 |
| - let result = unsafe { (&mut *self.first, &mut *self.last) }; |
143 |
| - self.first = unsafe { self.first.offset(1) }; |
144 |
| - self.last = unsafe { self.last.offset(-1) }; |
145 |
| - Some(result) |
146 |
| - } else { |
147 |
| - None |
| 241 | +/// Length of a normal line without the terminating \n. |
| 242 | +const LINE_LEN: uint = 60; |
| 243 | + |
| 244 | +/// Compute the reverse complement. |
| 245 | +fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) { |
| 246 | + // Drop the last newline |
| 247 | + let seq = view.as_mut_slice().init_mut(); |
| 248 | + let len = seq.len(); |
| 249 | + let off = LINE_LEN - len % (LINE_LEN + 1); |
| 250 | + let mut i = LINE_LEN; |
| 251 | + while i < len { |
| 252 | + unsafe { |
| 253 | + copy_memory(seq.as_mut_ptr().offset((i - off + 1) as int), |
| 254 | + seq.as_ptr().offset((i - off) as int), off); |
| 255 | + *seq.unsafe_mut(i - off) = b'\n'; |
| 256 | + } |
| 257 | + i += LINE_LEN + 1; |
| 258 | + } |
| 259 | + |
| 260 | + let (div, rem) = div_rem(len, 4); |
| 261 | + unsafe { |
| 262 | + let mut left = seq.as_mut_ptr() as *mut u16; |
| 263 | + // This is slow if len % 2 != 0 but still faster than bytewise operations. |
| 264 | + let mut right = seq.as_mut_ptr().offset(len as int - 2) as *mut u16; |
| 265 | + let end = left.offset(div as int); |
| 266 | + while left != end { |
| 267 | + let tmp = tables.cpl16(*left); |
| 268 | + *left = tables.cpl16(*right); |
| 269 | + *right = tmp; |
| 270 | + left = left.offset(1); |
| 271 | + right = right.offset(-1); |
| 272 | + } |
| 273 | + |
| 274 | + let end = end as *mut u8; |
| 275 | + match rem { |
| 276 | + 1 => *end = tables.cpl8(*end), |
| 277 | + 2 => { |
| 278 | + let tmp = tables.cpl8(*end); |
| 279 | + *end = tables.cpl8(*end.offset(1)); |
| 280 | + *end.offset(1) = tmp; |
| 281 | + }, |
| 282 | + 3 => { |
| 283 | + *end.offset(1) = tables.cpl8(*end.offset(1)); |
| 284 | + let tmp = tables.cpl8(*end); |
| 285 | + *end = tables.cpl8(*end.offset(2)); |
| 286 | + *end.offset(2) = tmp; |
| 287 | + }, |
| 288 | + _ => { }, |
148 | 289 | }
|
149 | 290 | }
|
150 | 291 | }
|
| 292 | + |
| 293 | +fn main() { |
| 294 | + let mut data = SharedMemory::new(read_to_end(&mut stdin_raw()).unwrap()); |
| 295 | + let tables = tables::get(); |
| 296 | + |
| 297 | + let mut futures = vec!(); |
| 298 | + loop { |
| 299 | + let (_, mut tmp_data) = match memchr(data.as_mut_slice(), b'\n') { |
| 300 | + Some(i) => data.split_at(i + 1), |
| 301 | + _ => break, |
| 302 | + }; |
| 303 | + let (view, tmp_data) = match memchr(tmp_data.as_mut_slice(), b'>') { |
| 304 | + Some(i) => tmp_data.split_at(i), |
| 305 | + None => { |
| 306 | + let len = tmp_data.len(); |
| 307 | + tmp_data.split_at(len) |
| 308 | + }, |
| 309 | + }; |
| 310 | + futures.push(Future::spawn(proc() reverse_complement(view, tables))); |
| 311 | + data = tmp_data; |
| 312 | + } |
| 313 | + |
| 314 | + for f in futures.iter_mut() { |
| 315 | + f.get(); |
| 316 | + } |
| 317 | + |
| 318 | + // Not actually unsafe. If Arc had a way to check uniqueness then we could do that in |
| 319 | + // `reset` and it would tell us that, yes, it is unique at this point. |
| 320 | + data = unsafe { data.reset() }; |
| 321 | + |
| 322 | + stdout_raw().write(data.as_mut_slice()).unwrap(); |
| 323 | +} |
0 commit comments