Make API and server compatible with OpenAI API

vmpuri · vmpuri · commit e28c75734bd4 · 2024-08-19T14:42:17.000-07:00
diff --git a/README.md b/README.md
@@ -181,16 +181,6 @@ This mode generates text based on an input prompt.
 python3 torchchat.py generate llama3.1 --prompt "write me a story about a boy and his bear"
 ```
 
-### Browser
-This mode allows you to chat with the model using a UI in your browser
-Running the command automatically open a tab in your browser.
-
-[skip default]: begin
-
-```
-streamlit run torchchat.py -- browser llama3.1
-```
-
 [skip default]: end
 
 ### Server
@@ -252,6 +242,19 @@ curl http://127.0.0.1:5000/v1/chat \
 
 </details>
 
+### Browser
+This command opens a basic browser interface for local chat by querying a local server.
+
+First, follow the steps in the Server section above to start a local server. Then, in another terminal, launch the interface. Running the following will open a tab in your browser.
+
+[skip default]: begin
+
+```
+streamlit run browser/browser.py
+```
+
+Use the "Max Response Tokens" slider to limit the maximum number of tokens generated by the model for each response. Click the "Reset Chat" button to remove the message history and start a fresh chat.
+
 
 ## Desktop/Server Execution
 
diff --git a/browser/browser.py b/browser/browser.py
@@ -1,40 +1,79 @@
+"""Streamlit chat front end for a local OpenAI-compatible server."""
+
+import time
+
 import streamlit as st
 from openai import OpenAI
 
+st.title("torchchat")
+
+# Conversation seed: the system prompt followed by the assistant's greeting.
+start_state = [
+    {
+        "role": "system",
+        "content": "You're an assistant. Answer questions directly, be brief, and have fun.",
+    },
+    {"role": "assistant", "content": "How can I help you?"},
+]
+
 with st.sidebar:
-    openai_api_key = st.text_input(
-        "OpenAI API Key", key="chatbot_api_key", type="password"
+    response_max_tokens = st.slider(
+        "Max Response Tokens", min_value=10, max_value=1000, value=250, step=10
     )
-    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
-    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/Chatbot.py)"
-    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"
-
-st.title("💬 Chatbot")
+    if st.button("Reset Chat", type="primary"):
+        st.session_state["messages"] = start_state
 
 if "messages" not in st.session_state:
-    st.session_state["messages"] = [
-        {
-            "role": "system",
-            "content": "You're an assistant. Be brief, no yapping. Use as few words as possible to respond to the users' questions.",
-        },
-        {"role": "assistant", "content": "How can I help you?"},
-    ]
+    st.session_state["messages"] = start_state
+
 
 for msg in st.session_state.messages:
     st.chat_message(msg["role"]).write(msg["content"])
 
 if prompt := st.chat_input():
     client = OpenAI(
-        # This is the default and can be omitted
         base_url="http://127.0.0.1:5000/v1",
-        api_key="YOURMOTHER",
+        api_key="813",  # The OpenAI API requires an API key, but since we don't consume it, this can be any non-empty string.
     )
 
     st.session_state.messages.append({"role": "user", "content": prompt})
     st.chat_message("user").write(prompt)
-    response = client.chat.completions.create(
-        model="stories15m", messages=st.session_state.messages, max_tokens=64
-    )
-    msg = response.choices[0].message.content
-    st.session_state.messages.append({"role": "assistant", "content": msg})
-    st.chat_message("assistant").write(msg)
+
+    with st.chat_message("assistant"), st.status(
+        "Generating... ", expanded=True
+    ) as status:
+
+        def get_streamed_completion(completion_generator):
+            """Yield each chunk's text delta, then report throughput.
+
+            Every chunk received is counted as one token; once the stream
+            is exhausted the status widget is marked complete with the
+            average tokens per second.
+            """
+            start = time.time()
+            tokcount = 0
+            for chunk in completion_generator:
+                tokcount += 1
+                yield chunk.choices[0].delta.content
+
+            status.update(
+                label="Done, averaged {:.2f} tokens/second".format(
+                    tokcount / (time.time() - start)
+                ),
+                state="complete",
+            )
+
+        # NOTE(review): indexing with [0] assumes write_stream returns a
+        # sequence whose first item is the response text - confirm.
+        response = st.write_stream(
+            get_streamed_completion(
+                client.chat.completions.create(
+                    model="llama3",
+                    messages=st.session_state.messages,
+                    max_tokens=response_max_tokens,
+                    stream=True,
+                )
+            )
+        )[0]
+
+    st.session_state.messages.append({"role": "assistant", "content": response})
diff --git a/server.py b/server.py
@@ -75,9 +75,6 @@ def chunk_processor(chunked_completion_generator):
                         next_tok = ""
                     print(next_tok, end="", flush=True)
                     yield f"data:{json.dumps(_del_none(asdict(chunk)))}\n\n"
-                    # wasda = json.dumps(asdict(chunk))
-                    # print(wasda)
-                    # yield wasda
 
             resp = Response(
                 chunk_processor(gen.chunked_completion(req)),