SciSharp
diff --git a/‎BotSharp.sln‎
Lines changed: 11 additions & 0 deletions b/‎BotSharp.sln‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs‎
Lines changed: 19 additions & 0 deletions b/‎src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs‎
Lines changed: 0 additions & 135 deletions b/‎src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs‎
Lines changed: 0 additions & 135 deletions
diff --git a/‎src/Plugins/BotSharp.Plugin.AudioHandler/Functions/ReadAudioFn.cs‎
Lines changed: 171 additions & 0 deletions b/‎src/Plugins/BotSharp.Plugin.AudioHandler/Functions/ReadAudioFn.cs‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs‎
Lines changed: 14 additions & 0 deletions b/‎src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs‎
Lines changed: 14 additions & 0 deletions
@@ -145,6 +145,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Test.RealtimeVoice
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ChartHandler", "src\Plugins\BotSharp.Plugin.ChartHandler\BotSharp.Plugin.ChartHandler.csproj", "{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ExcelHandler", "src\Plugins\BotSharp.Plugin.ExcelHandler\BotSharp.Plugin.ExcelHandler.csproj", "{FC63C875-E880-D8BB-B8B5-978AB7B62983}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -609,6 +611,14 @@ Global
 		{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702}.Release|Any CPU.Build.0 = Release|Any CPU
 		{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702}.Release|x64.ActiveCfg = Release|Any CPU
 		{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702}.Release|x64.Build.0 = Release|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Debug|x64.Build.0 = Debug|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Release|Any CPU.Build.0 = Release|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Release|x64.ActiveCfg = Release|Any CPU
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983}.Release|x64.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -679,6 +689,7 @@ Global
 		{7C0C7D13-D161-4AB0-9C29-83A0F1FF990E} = {32FAFFFE-A4CB-4FEE-BF7C-84518BBC6DCC}
 		{B067B126-88CD-4282-BEEF-7369B64423EF} = {32FAFFFE-A4CB-4FEE-BF7C-84518BBC6DCC}
 		{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
+		{FC63C875-E880-D8BB-B8B5-978AB7B62983} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {A9969D89-C98B-40A5-A12B-FC87E55B3A19}
 
@@ -62,6 +62,11 @@ public class LlmModelSetting
     /// </summary>
     public ImageSetting? Image { get; set; }
 
+    /// <summary>
+    /// Settings for audio
+    /// </summary>
+    public AudioSetting? Audio { get; set; }
+
     /// <summary>
     /// Settings for llm cost
     /// </summary>
@@ -128,6 +133,20 @@ public class ImageVariationSetting
 }
 #endregion
 
+#region Audio model settings
+/// <summary>
+/// Audio-related settings of an LLM model. Currently groups only the transcription settings.
+/// </summary>
+public class AudioSetting
+{
+    /// <summary>
+    /// Settings for audio transcription; null when not configured.
+    /// </summary>
+    public AudioTranscriptionSetting? Transcription { get; set; }
+}
+
+/// <summary>
+/// Settings for audio transcription. Every member is optional (nullable).
+/// </summary>
+public class AudioTranscriptionSetting
+{
+    // NOTE(review): presumably the sampling temperature sent to the transcription model - confirm against the consumer.
+    public float? Temperature { get; set; }
+    // NOTE(review): presumably describes the transcript response format choices - confirm against the consumer.
+    public ModelSettingBase? ResponseFormat { get; set; }
+    // NOTE(review): presumably describes the timestamp granularity choices - confirm against the consumer.
+    public ModelSettingBase? Granularity { get; set; }
+}
+#endregion
+
 public class ModelSettingBase
 {
     public string? Default { get; set; }
 
@@ -0,0 +1,171 @@
+namespace BotSharp.Plugin.AudioHandler.Functions;
+
+/// <summary>
+/// Function callback that transcribes the audio files attached to user messages
+/// of the current conversation and stores the transcript in the message content.
+/// </summary>
+public class ReadAudioFn : IFunctionCallback
+{
+    public string Name => "util-audio-handle_audio_request";
+    public string Indication => "Reading audio";
+
+    private readonly IServiceProvider _services;
+    private readonly IFileStorageService _fileStorage;
+    private readonly ILogger<ReadAudioFn> _logger;
+    private readonly BotSharpOptions _options;
+    private readonly AudioHandlerSettings _settings;
+
+    // Content types used to filter the message files requested from file storage.
+    private readonly IEnumerable<string> _audioContentTypes = new List<string>
+    {
+        AudioType.mp3.ToFileType(),
+        AudioType.wav.ToFileType(),
+    };
+
+    public ReadAudioFn(
+        IServiceProvider services,
+        ILogger<ReadAudioFn> logger,
+        BotSharpOptions options,
+        AudioHandlerSettings settings,
+        IFileStorageService fileStorage)
+    {
+        _services = services;
+        _logger = logger;
+        _options = options;
+        _settings = settings;
+        _fileStorage = fileStorage;
+    }
+
+    /// <summary>
+    /// Collects the audio files of the conversation, transcribes them and writes
+    /// the transcript into <paramref name="message"/>.Content.
+    /// </summary>
+    /// <param name="message">The function-call message; its Content is overwritten with the transcript.</param>
+    /// <returns>Always <c>true</c> when no exception is thrown.</returns>
+    public async Task<bool> Execute(RoleDialogModel message)
+    {
+        // NOTE(review): the deserialized arguments are not used below; the call is kept so that
+        // malformed function arguments still fail here exactly as before.
+        var args = JsonSerializer.Deserialize<LlmContextIn>(message.FunctionArgs, _options.JsonSerializerOptions);
+        var conv = _services.GetRequiredService<IConversationService>();
+        var routingCtx = _services.GetRequiredService<IRoutingContext>();
+
+        // Prefer the dialogs held by the routing context; fall back to the stored history.
+        var wholeDialogs = routingCtx.GetDialogs();
+        if (wholeDialogs.IsNullOrEmpty())
+        {
+            wholeDialogs = conv.GetDialogHistory();
+        }
+
+        var dialogs = AssembleFiles(conv.ConversationId, wholeDialogs);
+        try
+        {
+            message.Content = await GetAudioTranscription(dialogs);
+        }
+        finally
+        {
+            // AssembleFiles attaches files to the very dialog instances it was given, so detach
+            // them even when transcription throws; otherwise they would stay on those dialogs.
+            dialogs.ForEach(x => x.Files = null);
+        }
+
+        return true;
+    }
+
+    /// <summary>
+    /// Attaches the user-uploaded audio files found in file storage to the matching user dialogs.
+    /// </summary>
+    /// <param name="convId">Conversation id used to look up the message files.</param>
+    /// <param name="dialogs">Dialogs to decorate; the same instances are mutated and returned.</param>
+    /// <returns>The input list, or a new empty list when the input is null or empty.</returns>
+    private List<RoleDialogModel> AssembleFiles(string convId, List<RoleDialogModel> dialogs)
+    {
+        if (dialogs.IsNullOrEmpty())
+        {
+            return new List<RoleDialogModel>();
+        }
+
+        var messageId = dialogs.Select(x => x.MessageId).Distinct().ToList();
+        var audioFiles = _fileStorage.GetMessageFiles(convId, messageId, options: new()
+        {
+            Sources = [FileSource.User],
+            ContentTypes = _audioContentTypes
+        });
+
+        foreach (var dialog in dialogs)
+        {
+            var found = audioFiles.Where(x => x.MessageId == dialog.MessageId
+                                           && x.FileSource.IsEqualTo(FileSource.User)).ToList();
+
+            // Only user messages that actually have audio files get files attached.
+            if (found.IsNullOrEmpty() || !dialog.IsFromUser)
+            {
+                continue;
+            }
+
+            dialog.Files = found.Select(x => new BotSharpFile
+            {
+                ContentType = x.ContentType,
+                FileUrl = x.FileUrl,
+                FileStorageUrl = x.FileStorageUrl
+            }).ToList();
+        }
+
+        return dialogs;
+    }
+
+    /// <summary>
+    /// Transcribes every audio file of the last dialog that has files attached.
+    /// </summary>
+    /// <param name="dialogs">Dialogs previously decorated by <see cref="AssembleFiles"/>.</param>
+    /// <returns>
+    /// The transcripts joined by a blank line, or a "no audio" notice when nothing was transcribed.
+    /// </returns>
+    private async Task<string> GetAudioTranscription(List<RoleDialogModel> dialogs)
+    {
+        var audioCompletion = PrepareModel();
+        var dialog = dialogs.LastOrDefault(x => !x.Files.IsNullOrEmpty());
+        var transcripts = new List<string>();
+
+        if (dialog != null)
+        {
+            foreach (var file in dialog.Files)
+            {
+                if (string.IsNullOrWhiteSpace(file?.FileStorageUrl))
+                {
+                    continue;
+                }
+
+                var fileName = Path.GetFileName(file.FileStorageUrl);
+                if (!VerifyAudioFileType(fileName))
+                {
+                    continue;
+                }
+
+                var binary = _fileStorage.GetFileBytes(file.FileStorageUrl);
+                // The using declaration disposes the stream at the end of each iteration,
+                // so no explicit Close() is needed.
+                using var stream = binary.ToStream();
+                stream.Position = 0;
+
+                var result = await audioCompletion.TranscriptTextAsync(stream, fileName);
+                transcripts.Add(result);
+
+                // Short pause between consecutive transcription requests
+                // (presumably to avoid bursting the provider - TODO confirm).
+                await Task.Delay(100);
+            }
+        }
+
+        if (transcripts.IsNullOrEmpty())
+        {
+            var msg = "No audio is found in the chat.";
+            _logger.LogWarning(msg);
+            transcripts.Add(msg);
+        }
+
+        return string.Join("\r\n\r\n", transcripts);
+    }
+
+    /// <summary>
+    /// Resolves the audio transcriber for the configured provider and model.
+    /// </summary>
+    private IAudioTranscription PrepareModel()
+    {
+        var (provider, model) = GetLlmProviderModel();
+        return CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
+    }
+
+    /// <summary>
+    /// Checks whether the file name looks like a supported file: either its extension is an
+    /// <c>AudioType</c> name or <c>FileUtility</c> knows a content type for it.
+    /// </summary>
+    private bool VerifyAudioFileType(string fileName)
+    {
+        // Invariant lower-casing keeps the enum-name match independent of the current culture.
+        var extension = Path.GetExtension(fileName).TrimStart('.').ToLowerInvariant();
+        return Enum.TryParse<AudioType>(extension, out _)
+                    || !string.IsNullOrEmpty(FileUtility.GetFileContentType(fileName));
+    }
+
+    /// <summary>
+    /// Determines the LLM provider and model used for transcription. Resolution order:
+    /// conversation state, then the plugin settings, then the built-in default
+    /// ("openai" / "gpt-4o-mini-transcribe"). A source is used only when it supplies both values.
+    /// </summary>
+    /// <returns>A (provider, model) tuple; both items are non-empty.</returns>
+    private (string, string) GetLlmProviderModel()
+    {
+        var state = _services.GetRequiredService<IConversationStateService>();
+
+        var provider = state.GetState("audio_read_llm_provider");
+        // The model has its own state key; reading the provider key here would hand the
+        // provider name to the transcriber as the model name.
+        var model = state.GetState("audio_read_llm_model");
+
+        if (!string.IsNullOrEmpty(provider) && !string.IsNullOrEmpty(model))
+        {
+            return (provider, model);
+        }
+
+        provider = _settings?.Audio?.Reading?.LlmProvider;
+        model = _settings?.Audio?.Reading?.LlmModel;
+
+        if (!string.IsNullOrEmpty(provider) && !string.IsNullOrEmpty(model))
+        {
+            return (provider, model);
+        }
+
+        provider = "openai";
+        model = "gpt-4o-mini-transcribe";
+
+        return (provider, model);
+    }
+}
@@ -1,5 +1,19 @@
+using BotSharp.Abstraction.Models;
+
 namespace BotSharp.Plugin.AudioHandler.Settings;
 
+/// <summary>
+/// Settings of the audio handler plugin.
+/// </summary>
 public class AudioHandlerSettings
+{
+    /// <summary>
+    /// Audio settings; null when not configured.
+    /// </summary>
+    public AudioSettings? Audio { get; set; }
+}
+
+#region Audio
+/// <summary>
+/// Groups the audio-related settings of the plugin. Currently holds only the reading settings.
+/// </summary>
+public class AudioSettings
+{
+    /// <summary>
+    /// Settings used when reading audio; null when not configured.
+    /// </summary>
+    public AudioReadSettings? Reading { get; set; }
+}
+
+/// <summary>
+/// Settings for reading audio. Adds no members of its own; everything comes from LlmBase.
+/// </summary>
+public class AudioReadSettings : LlmBase
 {
 }
+#endregion