Skip to content

feat(multimodal): allow to template placeholders #3728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,10 @@ type TemplateConfig struct {
// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`

Video string `yaml:"video"`
Image string `yaml:"image"`
Audio string `yaml:"audio"`
}

func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
Expand Down
21 changes: 18 additions & 3 deletions core/http/endpoints/openai/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/templates"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
)
Expand Down Expand Up @@ -168,8 +169,13 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
continue CONTENT
}
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff

t := "[vid-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Video != "" {
t = config.TemplateConfig.Video
}
// set a placeholder for each video
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
vidIndex++
case "audio_url", "audio":
// Decode content as base64 either if it's an URL or base64 text
Expand All @@ -180,7 +186,11 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each audio
input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
t := "[audio-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Audio != "" {
t = config.TemplateConfig.Audio
}
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
audioIndex++
case "image_url", "image":
// Decode content as base64 either if it's an URL or base64 text
Expand All @@ -189,9 +199,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}

t := "[img-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Image != "" {
t = config.TemplateConfig.Image
}
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
imgIndex++
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/model/initializers.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

client = NewModel(modelID, serverAddress, process)
} else {
log.Debug().Msg("external backend is uri")
log.Debug().Msg("external backend is a uri")
// address
client = NewModel(modelID, uri, nil)
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/templates/multimodal.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package templates

import (
"bytes"
"text/template"
)

// TemplateMultiModal renders templateString with text/template and returns
// the rendered output.
//
// The template is executed against a value exposing two fields:
//   - ID:   the templateID argument
//   - Text: the text argument
//
// For example, the template "[img-{{.ID}}]{{.Text}}" with templateID 1 and
// text "bar" renders to "[img-1]bar".
//
// A non-nil error is returned when templateString fails to parse or fails to
// execute (for instance when it references a field other than ID or Text).
// In both cases the returned string is empty, so callers never observe a
// half-rendered result.
func TemplateMultiModal(templateString string, templateID int, text string) (string, error) {
	// Compile the template.
	tmpl, err := template.New("template").Parse(templateString)
	if err != nil {
		return "", err
	}

	// Execute the template into an in-memory buffer.
	var result bytes.Buffer
	err = tmpl.Execute(&result, struct {
		ID   int
		Text string
	}{
		ID:   templateID,
		Text: text,
	})
	if err != nil {
		// Execute may already have written part of the output before
		// failing; discard it so the error path matches the parse-failure
		// path above.
		return "", err
	}
	return result.String(), nil
}
19 changes: 19 additions & 0 deletions pkg/templates/multimodal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package templates_test

import (
. "github.com/mudler/LocalAI/pkg/templates" // Update with your module path

// Test framework (Ginkgo) and matcher library (Gomega).
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// Specs for TemplateMultiModal: rendering of multimodal placeholder templates
// that expose the .ID and .Text fields.
var _ = Describe("TemplateMultiModal", func() {
	Context("templating simple strings for multimodal chat", func() {
		It("should template messages correctly", func() {
			result, err := TemplateMultiModal("[img-{{.ID}}]{{.Text}}", 1, "bar")
			Expect(err).NotTo(HaveOccurred())
			Expect(result).To(Equal("[img-1]bar"))
		})

		It("should honor a custom placeholder layout", func() {
			result, err := TemplateMultiModal("{{.Text}} <video {{.ID}}>", 2, "foo")
			Expect(err).NotTo(HaveOccurred())
			Expect(result).To(Equal("foo <video 2>"))
		})

		It("should return an error for a malformed template", func() {
			// The action is never closed, so parsing must fail.
			_, err := TemplateMultiModal("[img-{{.ID", 1, "bar")
			Expect(err).To(HaveOccurred())
		})
	})
})
Loading