1 change: 1 addition & 0 deletions .gitignore
@@ -72,6 +72,7 @@ models-mnt
 /batched-bench
 /export-lora
 /finetune
+/retrieval
 /speculative
 /parallel
 /train-text-from-scratch
6 changes: 5 additions & 1 deletion Makefile
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -794,6 +794,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
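With the Makefile target above in place, the example builds like any other llama.cpp binary: `make retrieval` produces a `./retrieval` executable, which can then be pointed at an embedding model and one or more context files, e.g. `./retrieval -m <embedding-model.gguf> --context-files README.md --chunk-size 100` (the model path and flag values here are illustrative, not taken from the PR).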
42 changes: 42 additions & 0 deletions common/common.cpp
@@ -276,6 +276,43 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         }
         return true;
     }
+    if (arg == "--context-files") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        while(true) {
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            // store the external file name in params
+            params.context_files.push_back(argv[i]);
+            if (i + 1 >= argc || argv[i + 1][0] == '-') {
+                break;
+            }
+            i++;
+        }
+        return true;
+    }
+    if (arg == "--chunk-size") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.chunk_size = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--chunk-separator") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.chunk_separator = argv[i];
+        return true;
+    }
     if (arg == "-n" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1282,6 +1319,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        prompt file to start generation.\n");
     printf("  -bf FNAME, --binary-file FNAME\n");
     printf("                        binary file containing multiple choice tasks.\n");
+    printf("  --context-files FNAME1 FNAME2...\n");
+    printf("                        files containing context to embed.\n");
+    printf("  --chunk-size N        minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
+    printf("  --chunk-separator STRING\n");
+    printf("                        string to separate chunks (default: \"\\n\")\n");
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  logical maximum batch size (default: %d)\n", params.n_batch);
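A note on the `--context-files` parsing above: the loop keeps consuming consecutive arguments until it reaches one that begins with `-` (or the end of argv), so an invocation like `--context-files a.txt b.txt --chunk-size 100` stores both file names before handing control back to the main parser. One visible consequence is that a file whose name itself starts with `-` would be mistaken for the next flag.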
3 changes: 3 additions & 0 deletions common/common.h
@@ -79,6 +79,9 @@ struct gpt_params {
     float   yarn_beta_slow    = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx     = 0;     // YaRN original context length
     float   defrag_thold      = -1.0f; // KV cache defragmentation threshold
+    std::vector<std::string> context_files; // context files to embed
+    int32_t chunk_size = 64;                // chunk size for context embedding
+    std::string chunk_separator = "\n";     // chunk separator for context embedding

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

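The two new fields suggest a straightforward chunking policy: split the input on `chunk_separator` and accumulate pieces until a chunk reaches at least `chunk_size` characters. The sketch below is a hypothetical illustration of that policy; the helper name and exact boundary handling are assumptions, not code from this PR.

```cpp
#include <string>
#include <vector>

// Hypothetical sketch: split `text` at each separator, accumulating pieces
// until a chunk is at least `chunk_size` characters long. Illustrative only;
// the retrieval example implements its own chunking.
static std::vector<std::string> chunk_text(const std::string & text, size_t chunk_size, const std::string & sep) {
    std::vector<std::string> chunks;
    if (sep.empty()) {
        chunks.push_back(text); // degenerate separator: treat input as one chunk
        return chunks;
    }
    std::string current;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t next = text.find(sep, pos);
        if (next == std::string::npos) {
            next = text.size();
        }
        current += text.substr(pos, next - pos);
        pos = next + sep.size();
        if (current.size() >= chunk_size) { // minimum chunk length reached
            chunks.push_back(current);
            current.clear();
        }
    }
    if (!current.empty()) {
        chunks.push_back(current); // keep the trailing remainder
    }
    return chunks;
}
```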
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -34,6 +34,7 @@ else()
     add_subdirectory(perplexity)
     add_subdirectory(quantize)
     add_subdirectory(quantize-stats)
+    add_subdirectory(retrieval)
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(passkey)
5 changes: 5 additions & 0 deletions examples/retrieval/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET retrieval)
+add_executable(${TARGET} retrieval.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
6 changes: 6 additions & 0 deletions examples/retrieval/README.md
@@ -0,0 +1,6 @@
+# llama.cpp/examples/retrieval
+
+Demonstration of a simple retrieval technique based on cosine similarity
+
+More info:
+https://github.com/ggerganov/llama.cpp/pull/6193
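For reference, the cosine similarity the README mentions is the dot product of two embedding vectors divided by the product of their norms. A minimal C++ sketch of that computation follows; it is illustrative only, not the example's own code, which operates on embeddings produced by llama.cpp.

```cpp
#include <cmath>
#include <vector>

// Cosine similarity of two embedding vectors: dot(a, b) / (|a| * |b|).
// Vectors are assumed to have the same length; all-zero inputs return 0
// to avoid dividing by zero.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        dot    += a[i] * b[i];
        norm_a += a[i] * a[i];
        norm_b += b[i] * b[i];
    }
    if (norm_a == 0.0f || norm_b == 0.0f) {
        return 0.0f;
    }
    return dot / (std::sqrt(norm_a) * std::sqrt(norm_b));
}
```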