ProgrammingRust · simonsan · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,8 +2,25 @@
 name = "fingertips"
 version = "0.1.0"
 authors = ["Jason Orendorff <[email protected]>"]
-edition = "2018"
+edition = "2021"
 
 [dependencies]
 argparse = "0.2.1"
-byteorder = "0.5.3"
+byteorder = "1.5.0"
+displaydoc = "0.2.4"
+thiserror = "1.0.58"
+
+[lints.rust]
+unsafe_code = "forbid"
+missing_docs = "warn"
+rust_2018_idioms = "warn"
+trivial_casts = "warn"
+unused_lifetimes = "warn"
+unused_qualifications = "warn"
+bad_style = "warn"
+dead_code = "warn"
+
+[lints.clippy]
+all = "warn"
+unwrap_used = "warn"
+expect_used = "warn"
diff --git a/src/bin/fingertips.rs b/src/bin/fingertips.rs
@@ -0,0 +1,45 @@
+//! `fingertips` creates an inverted index for a set of text files.
+//!
+//! Most of the actual work is done by the modules `index`, `read`, `write`,
+//! and `merge`.  The `fingertips` library crate puts the pieces together in
+//! two different ways.
+//!
+//! *   `run_single_threaded` simply does everything in one thread, in
+//!     the most straightforward possible way.
+//!
+//! *   Then, we break the work into a five-stage pipeline so that we can run
+//!     it on multiple CPUs. `run_pipeline` puts the five stages together.
+//!
+//! This binary only handles command-line arguments: `main` parses them and
+//! passes them to `fingertips::run`, which does the work.
+
+use argparse::{ArgumentParser, Collect, StoreTrue};
+use fingertips::run;
+
+fn main() {
+    let mut single_threaded = false;
+    let mut filenames = vec![];
+    // The parser mutably borrows both locals, so it lives in its own scope that ends before `run`.
+    {
+        let mut ap = ArgumentParser::new();
+        ap.set_description("Make an inverted index for searching documents.");
+        _ = ap.refer(&mut single_threaded).add_option(
+            &["-1", "--single-threaded"],
+            StoreTrue,
+            "Do all the work on a single thread.",
+        );
+        _ = ap.refer(&mut filenames).add_argument(
+            "filenames",
+            Collect,
+            "Names of files/directories to index. \
+                For directories, all .txt files immediately \
+                under the directory are indexed.",
+        );
+        ap.parse_args_or_exit();
+    }
+    // Report failures on stderr and through a non-zero exit status so callers can detect them.
+    if let Err(err) = run(filenames, single_threaded) {
+        eprintln!("error: {err}");
+        std::process::exit(1);
+    }
+}
diff --git a/src/error.rs b/src/error.rs
@@ -0,0 +1,55 @@
+use std::error::Error;
+
+/// Convenience alias for a [`Result`] whose error type is [`FingertipsError`].
+pub type FingertipsResult<T> = Result<T, FingertipsError>;
+
+/// Errors that can result from Fingertips.
+// `FingertipsError` is public, but opaque and easy to keep compatible: it only wraps a `FingertipsErrorKind`.
+#[derive(thiserror::Error, Debug)]
+#[error(transparent)]
+pub struct FingertipsError(#[from] FingertipsErrorKind);
+
+// Accessors for anything we do want to expose publicly.
+impl FingertipsError {
+    /// Consume the error and return the wrapped [`FingertipsErrorKind`].
+    ///
+    /// This is useful for matching on the error kind.
+    pub fn into_inner(self) -> FingertipsErrorKind {
+        self.0
+    }
+}
+
+/// [`FingertipsErrorKind`] describes the errors that can happen while executing a high-level command.
+///
+/// Each variant's doc comment doubles as its `Display` message (via `displaydoc`). The enum is
+/// non-exhaustive, so additional variants may be added in future; match with a wildcard `_` arm
+/// instead of listing all possible variants, to avoid problems when new variants are added.
+#[non_exhaustive]
+#[derive(thiserror::Error, Debug, displaydoc::Display)]
+pub enum FingertipsErrorKind {
+    /// An error occurred while reading from or writing to a file: {0}
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+    /// An error occurred while parsing a file
+    TermEmpty,
+    /// An error occurred in the algorithm
+    AlgorithmError,
+    /// No entry to move
+    NoEntryToMove,
+    /// Computer not big enough to hold index entry, you may be on 32bit platform
+    PlatformLimitExceeded,
+}
+
+/// Marker trait: error types implementing it gain a `From` conversion into [`FingertipsError`].
+trait FingertipsErrorMarker: Error {}
+
+// Opt in with `impl FingertipsErrorMarker for SomeError {}`; nothing visible here implements it yet.
+impl<E> From<E> for FingertipsError
+where
+    E: FingertipsErrorMarker,
+    FingertipsErrorKind: From<E>,
+{
+    fn from(value: E) -> Self {
+        Self(FingertipsErrorKind::from(value))
+    }
+}
diff --git a/src/index.rs b/src/index.rs
@@ -4,8 +4,10 @@
 //! `InMemoryIndex` can be used to do that, up to the size of the machine's
 //! memory.
 
-use std::collections::HashMap;
 use byteorder::{LittleEndian, WriteBytesExt};
+use std::collections::HashMap;
+
+use crate::error::{FingertipsErrorKind, FingertipsResult};
 
 /// Break a string into words.
 fn tokenize(text: &str) -> Vec<&str> {
@@ -21,6 +23,7 @@ fn tokenize(text: &str) -> Vec<&str> {
 /// answer simple search queries. And you can use the `read`, `write`, and
 /// `merge` modules to save an in-memory index to disk and merge it with other
 /// indices, producing a large index.
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
 pub struct InMemoryIndex {
     /// The total number of words in the indexed documents.
     pub word_count: usize,
@@ -34,7 +37,7 @@ pub struct InMemoryIndex {
     /// document id in increasing order. This is handy for some algorithms you
     /// might want to run on the index, so we preserve this property wherever
     /// possible.
-    pub map: HashMap<String, Vec<Hit>>
+    pub map: HashMap<String, Vec<Hit>>,
 }
 
 /// A `Hit` indicates that a particular document contains some term, how many
@@ -47,52 +50,60 @@ pub type Hit = Vec<u8>;
 
 impl InMemoryIndex {
     /// Create a new, empty index.
-    pub fn new() -> InMemoryIndex {
-        InMemoryIndex {
-            word_count: 0,
-            map: HashMap::new()
-        }
+    pub fn new() -> Self {
+        Self::default() // derived `Default`: zero `word_count`, empty `map`
     }
 
     /// Index a single document.
     ///
     /// The resulting index contains exactly one `Hit` per term.
-    pub fn from_single_document(document_id: usize, text: String) -> InMemoryIndex {
+    pub fn from_single_document(document_id: usize, text: String) -> FingertipsResult<Self> {
         let document_id = document_id as u32;
-        let mut index = InMemoryIndex::new();
+        let mut index = Self::new();
 
         let text = text.to_lowercase();
         let tokens = tokenize(&text);
+
+        let hits_list = {
+            let mut hits = Vec::with_capacity(4 + 4);
+            hits.write_u32::<LittleEndian>(document_id)
+                .map_err(FingertipsErrorKind::Io)?;
+            // Template list with one `Hit` that so far holds only the document id.
+            vec![hits]
+        };
+
         for (i, token) in tokens.iter().enumerate() {
-            let hits =
-                index.map
-                .entry(token.to_string())
-                .or_insert_with(|| {
-                    let mut hits = Vec::with_capacity(4 + 4);
-                    hits.write_u32::<LittleEndian>(document_id).unwrap();
-                    vec![hits]
-                });
-            hits[0].write_u32::<LittleEndian>(i as u32).unwrap();
+            let hits = index
+                .map
+                .entry((*token).to_string())
+                .or_insert_with(|| hits_list.clone()); // clone only for new terms
+            hits[0]
+                .write_u32::<LittleEndian>(i as u32)
+                .map_err(FingertipsErrorKind::Io)?;
+
             index.word_count += 1;
         }
 
         if document_id % 100 == 0 {
-            println!("indexed document {}, {} bytes, {} words", document_id, text.len(), index.word_count);
+            println!(
+                "indexed document {}, {} bytes, {} words",
+                document_id,
+                text.len(),
+                index.word_count
+            );
         }
 
-        index
+        Ok(index)
     }
 
     /// Add all search hits from `other` to this index.
     ///
     /// If both `*self` and `other` are sorted by document id, and all document
     /// ids in `other` are greater than every document id in `*self`, then
     /// `*self` remains sorted by document id after merging.
-    pub fn merge(&mut self, other: InMemoryIndex) {
+    pub fn merge(&mut self, other: Self) {
         for (term, hits) in other.map {
-            self.map.entry(term)
-                .or_insert_with(|| vec![])
-                .extend(hits)
+            self.map.entry(term).or_default().extend(hits); // new terms start from an empty list
         }
         self.word_count += other.word_count;
     }