diff --git a/examples/google/embeddings.php b/examples/google/embeddings.php new file mode 100644 index 00000000..9547c850 --- /dev/null +++ b/examples/google/embeddings.php @@ -0,0 +1,36 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\AI\Platform\Bridge\Google\Embeddings; +use Symfony\AI\Platform\Bridge\Google\PlatformFactory; +use Symfony\AI\Platform\Response\VectorResponse; +use Symfony\Component\Dotenv\Dotenv; + +require_once dirname(__DIR__).'/vendor/autoload.php'; +(new Dotenv())->loadEnv(dirname(__DIR__).'/.env'); + +if (empty($_ENV['GOOGLE_API_KEY'])) { + echo 'Please set the GOOGLE_API_KEY environment variable.'.\PHP_EOL; + exit(1); +} + +$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']); +$embeddings = new Embeddings(); + +$response = $platform->request($embeddings, <<getContent()[0]->getDimensions().\PHP_EOL; diff --git a/examples/store/mariadb-similarity-search-gemini.php b/examples/store/mariadb-similarity-search-gemini.php new file mode 100644 index 00000000..a3273488 --- /dev/null +++ b/examples/store/mariadb-similarity-search-gemini.php @@ -0,0 +1,85 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */
+
+// NOTE(review): the PhpLlm\LlmChain imports below do not match the Symfony\AI\Platform\Bridge\Google namespace in
+// which this change declares Embeddings and TaskType; confirm the intended namespaces before running the example.
+use Doctrine\DBAL\DriverManager;
+use Doctrine\DBAL\Tools\DsnParser;
+use PhpLlm\LlmChain\Chain\Chain;
+use PhpLlm\LlmChain\Chain\Toolbox\ChainProcessor;
+use PhpLlm\LlmChain\Chain\Toolbox\Tool\SimilaritySearch;
+use PhpLlm\LlmChain\Chain\Toolbox\Toolbox;
+use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings;
+use PhpLlm\LlmChain\Platform\Bridge\Google\Embeddings\TaskType;
+use PhpLlm\LlmChain\Platform\Bridge\Google\Gemini;
+use PhpLlm\LlmChain\Platform\Bridge\Google\PlatformFactory;
+use PhpLlm\LlmChain\Platform\Message\Message;
+use PhpLlm\LlmChain\Platform\Message\MessageBag;
+use PhpLlm\LlmChain\Store\Bridge\MariaDB\Store;
+use PhpLlm\LlmChain\Store\Document\Metadata;
+use PhpLlm\LlmChain\Store\Document\TextDocument;
+use PhpLlm\LlmChain\Store\Indexer;
+use Symfony\Component\Dotenv\Dotenv;
+use Symfony\Component\Uid\Uuid;
+
+require_once dirname(__DIR__, 2).'/vendor/autoload.php';
+(new Dotenv())->loadEnv(dirname(__DIR__, 2).'/.env');
+
+if (empty($_ENV['GOOGLE_API_KEY']) || empty($_ENV['MARIADB_URI'])) {
+    echo 'Please set GOOGLE_API_KEY and MARIADB_URI environment variables.'.\PHP_EOL;
+    exit(1);
+}
+
+// initialize the store
+$store = Store::fromDbal(
+    connection: DriverManager::getConnection((new DsnParser())->parse($_ENV['MARIADB_URI'])),
+    tableName: 'my_table',
+    indexName: 'my_index',
+    vectorFieldName: 'embedding',
+);
+
+// our data
+$movies = [
+    ['title' => 'Inception', 'description' => 'A skilled thief is given a chance at redemption if he can successfully perform inception, the act of planting an idea in someone\'s subconscious.', 'director' => 'Christopher Nolan'],
+    ['title' => 'The Matrix', 'description' => 'A hacker discovers the world he lives in is a simulated reality and joins a rebellion to overthrow its controllers.', 'director' => 'The Wachowskis'],
+    ['title' => 'The Godfather', 'description' => 'The aging patriarch of an organized crime dynasty transfers control of his empire to his reluctant son.', 'director' => 'Francis Ford Coppola'],
+];
+
+// create one text document per movie (assigned in one step, so $documents is always defined)
+$documents = array_map(static fn (array $movie): TextDocument => new TextDocument(
+    id: Uuid::v4(),
+    content: 'Title: '.$movie['title'].\PHP_EOL.'Director: '.$movie['director'].\PHP_EOL.'Description: '.$movie['description'],
+    metadata: new Metadata($movie),
+), $movies);
+
+// initialize the table
+$store->initialize(['dimensions' => 768]);
+
+// create embeddings for documents
+$platform = PlatformFactory::create($_ENV['GOOGLE_API_KEY']);
+$embeddings = new Embeddings(options: ['dimensions' => 768, 'task_type' => TaskType::SemanticSimilarity]);
+$indexer = new Indexer($platform, $embeddings, $store);
+$indexer->index($documents);
+
+$model = new Gemini(Gemini::GEMINI_2_FLASH_LITE);
+
+$similaritySearch = new SimilaritySearch($platform, $embeddings, $store);
+$toolbox = Toolbox::create($similaritySearch);
+$processor = new ChainProcessor($toolbox);
+$chain = new Chain($platform, $model, [$processor], [$processor]);
+
+$messages = new MessageBag(
+    Message::forSystem('Please answer all user questions only using SimilaritySearch function.'),
+    Message::ofUser('Which movie fits the theme of the mafia?')
+);
+$response = $chain->call($messages);
+
+echo $response->getContent().\PHP_EOL;
diff --git a/src/platform/doc/index.rst b/src/platform/doc/index.rst
index bdbb65ef..61d1bb98 100644
--- a/src/platform/doc/index.rst
+++ b/src/platform/doc/index.rst
@@ -78,6 +78,7 @@ usually defined by the specific models and their documentation.
     * `Amazon's Nova`_ with `AWS Bedrock`_ as Platform
     * `Mistral's Mistral`_ with `Mistral`_ as Platform
 * **Embeddings Models**
+    * `Google's Text Embeddings`_ with `Google`_ as Platform
     * `OpenAI's Text Embeddings`_ with `OpenAI`_ and `Azure`_ as Platform
     * `Voyage's Embeddings`_ with `Voyage`_ as Platform
     * `Mistral Embed`_ with `Mistral`_ as Platform
@@ -274,6 +275,7 @@ which can be useful to speed up the processing::
 .. _`Amazon's Nova`: https://nova.amazon.com
 .. 
_`Mistral's Mistral`: https://www.mistral.ai/
 .. _`Mistral`: https://www.mistral.ai/
+.. _`Google's Text Embeddings`: https://ai.google.dev/gemini-api/docs/embeddings
 .. _`OpenAI's Text Embeddings`: https://platform.openai.com/docs/guides/embeddings/embedding-models
 .. _`Voyage's Embeddings`: https://docs.voyageai.com/docs/embeddings
 .. _`Voyage`: https://www.voyageai.com/
diff --git a/src/platform/src/Bridge/Google/Embeddings.php b/src/platform/src/Bridge/Google/Embeddings.php
new file mode 100644
index 00000000..ae6a17ff
--- /dev/null
+++ b/src/platform/src/Bridge/Google/Embeddings.php
@@ -0,0 +1,39 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\Google;
+
+use Symfony\AI\Platform\Bridge\Google\Embeddings\TaskType;
+use Symfony\AI\Platform\Capability;
+use Symfony\AI\Platform\Model;
+
+/**
+ * Google embedding model: carries the model name and the options read by the Google embeddings client.
+ *
+ * @author Valtteri R
+ */
+class Embeddings extends Model
+{
+    /** Supported dimensions: 3072, 1536, or 768 */
+    public const GEMINI_EMBEDDING_EXP_03_07 = 'gemini-embedding-exp-03-07';
+    /** Fixed 768 dimensions */
+    public const TEXT_EMBEDDING_004 = 'text-embedding-004';
+    /** Fixed 768 dimensions */
+    public const EMBEDDING_001 = 'embedding-001';
+
+    /**
+     * @param array{dimensions?: int, task_type?: TaskType|string} $options
+     */
+    public function __construct(string $name = self::GEMINI_EMBEDDING_EXP_03_07, array $options = [])
+    {
+        parent::__construct($name, [Capability::INPUT_MULTIPLE], $options);
+    }
+}
diff --git a/src/platform/src/Bridge/Google/Embeddings/ModelClient.php b/src/platform/src/Bridge/Google/Embeddings/ModelClient.php
new file mode 100644
index 00000000..1fbbb7c0
--- /dev/null
+++ b/src/platform/src/Bridge/Google/Embeddings/ModelClient.php
@@ -0,0 +1,101 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\Google\Embeddings;
+
+use Symfony\AI\Platform\Bridge\Google\Embeddings;
+use Symfony\AI\Platform\Exception\RuntimeException;
+use Symfony\AI\Platform\Model;
+use Symfony\AI\Platform\ModelClientInterface;
+use Symfony\AI\Platform\Response\VectorResponse;
+use Symfony\AI\Platform\ResponseConverterInterface;
+use Symfony\AI\Platform\Vector\Vector;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+use Symfony\Contracts\HttpClient\ResponseInterface;
+
+/**
+ * Sends texts to Google's "batchEmbedContents" endpoint and converts the reply into vectors.
+ *
+ * @author Valtteri R
+ */
+final readonly class ModelClient implements ModelClientInterface, ResponseConverterInterface
+{
+    public function __construct(
+        private HttpClientInterface $httpClient,
+        #[\SensitiveParameter]
+        private string $apiKey,
+    ) {
+    }
+
+    public function supports(Model $model): bool
+    {
+        return $model instanceof Embeddings;
+    }
+
+    /**
+     * Builds one embed request per input text and posts them as a single batch.
+     *
+     * The model options "dimensions" and "task_type" and the call option "title" are added to
+     * every request; options that are not set (null) are left out.
+     */
+    public function request(Model $model, array|string $payload, array $options = []): ResponseInterface
+    {
+        $modelName = $model->getName();
+        $modelOptions = $model->getOptions();
+        $taskType = $modelOptions['task_type'] ?? null;
+
+        // Only null means "not set": a bare array_filter() would also drop legitimate falsy values such as a title of "0".
+        $settings = array_filter([
+            'outputDimensionality' => $modelOptions['dimensions'] ?? null,
+            // The option may be a backed enum case or a plain string; the request carries the string value.
+            'taskType' => $taskType instanceof \BackedEnum ? $taskType->value : $taskType,
+            'title' => $options['title'] ?? null,
+        ], static fn (mixed $value): bool => null !== $value);
+        $url = \sprintf('https://generativelanguage.googleapis.com/v1beta/models/%s:batchEmbedContents', $modelName);
+
+        return $this->httpClient->request('POST', $url, [
+            'headers' => [
+                'x-goog-api-key' => $this->apiKey,
+            ],
+            'json' => [
+                'requests' => array_map(
+                    static fn (string $text): array => [
+                        'model' => 'models/'.$modelName,
+                        'content' => ['parts' => [['text' => $text]]],
+                        ...$settings,
+                    ],
+                    \is_array($payload) ? $payload : [$payload],
+                ),
+            ],
+        ]);
+    }
+
+    /**
+     * Turns the "embeddings" list of the decoded response into a VectorResponse, one Vector per entry.
+     *
+     * @throws RuntimeException when the response has no "embeddings" entry
+     */
+    public function convert(ResponseInterface $response, array $options = []): VectorResponse
+    {
+        $data = $response->toArray();
+
+        if (!isset($data['embeddings'])) {
+            throw new RuntimeException('Response does not contain data');
+        }
+
+        return new VectorResponse(
+            ...array_map(
+                static fn (array $item): Vector => new Vector($item['values']),
+                $data['embeddings'],
+            ),
+        );
+    }
+}
diff --git a/src/platform/src/Bridge/Google/Embeddings/TaskType.php b/src/platform/src/Bridge/Google/Embeddings/TaskType.php
new file mode 100644
index 00000000..fa8e5511
--- /dev/null
+++ b/src/platform/src/Bridge/Google/Embeddings/TaskType.php
@@ -0,0 +1,37 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Platform\Bridge\Google\Embeddings;
+
+/**
+ * Task types that can be passed as the "task_type" option of the Google Embeddings model.
+ */
+enum TaskType: string
+{
+    /** Unset value, which will default to one of the other enum values. */
+    case TaskTypeUnspecified = 'TASK_TYPE_UNSPECIFIED';
+    /** Specifies the given text is a query in a search/retrieval setting. */
+    case RetrievalQuery = 'RETRIEVAL_QUERY';
+    /** Specifies the given text is a document from the corpus being searched. */
+    case RetrievalDocument = 'RETRIEVAL_DOCUMENT';
+    /** Specifies the given text will be used for STS. */
+    case SemanticSimilarity = 'SEMANTIC_SIMILARITY';
+    /** Specifies that the given text will be classified. */
+    case Classification = 'CLASSIFICATION';
+    /** Specifies that the embeddings will be used for clustering. */
+    case Clustering = 'CLUSTERING';
+    /** Specifies that the given text will be used for question answering. */
+    case QuestionAnswering = 'QUESTION_ANSWERING';
+    /** Specifies that the given text will be used for fact verification. */
+    case FactVerification = 'FACT_VERIFICATION';
+    /** Specifies that the given text will be used for code retrieval. */
+    case CodeRetrievalQuery = 'CODE_RETRIEVAL_QUERY';
+}
diff --git a/src/platform/src/Bridge/Google/PlatformFactory.php b/src/platform/src/Bridge/Google/PlatformFactory.php
index 49fe8e32..56729767 100644
--- a/src/platform/src/Bridge/Google/PlatformFactory.php
+++ b/src/platform/src/Bridge/Google/PlatformFactory.php
@@ -16,6 +16,7 @@
 use Symfony\AI\Platform\Bridge\Google\Contract\ToolCallMessageNormalizer;
 use Symfony\AI\Platform\Bridge\Google\Contract\ToolNormalizer;
 use Symfony\AI\Platform\Bridge\Google\Contract\UserMessageNormalizer;
+use Symfony\AI\Platform\Bridge\Google\Embeddings\ModelClient;
 use Symfony\AI\Platform\Contract;
 use Symfony\AI\Platform\Platform;
 use Symfony\Component\HttpClient\EventSourceHttpClient;
@@ -33,8 +34,9 @@ public static function create(
     ): Platform {
         $httpClient = $httpClient instanceof EventSourceHttpClient ? $httpClient : new EventSourceHttpClient($httpClient);
         $responseHandler = new ModelHandler($httpClient, $apiKey);
+        $embeddings = new ModelClient($httpClient, $apiKey);
 
-        return new Platform([$responseHandler], [$responseHandler], Contract::create(
+        return new Platform([$responseHandler, $embeddings], [$responseHandler, $embeddings], Contract::create(
             new AssistantMessageNormalizer(),
             new MessageBagNormalizer(),
             new ToolNormalizer(),
diff --git a/src/platform/src/ModelClientInterface.php b/src/platform/src/ModelClientInterface.php
index 76b6ae87..b727a867 100644
--- a/src/platform/src/ModelClientInterface.php
+++ b/src/platform/src/ModelClientInterface.php
@@ -21,8 +21,8 @@ interface ModelClientInterface
     public function supports(Model $model): bool;
 
     /**
-     * @param array $payload
-     * @param array $options
+     * @param array $payload
+     * @param array $options
      */
     public function request(Model $model, array|string $payload, array $options = []): ResponseInterface;
 }
diff --git a/tests/Platform/Bridge/Google/Embeddings/EmbeddingsModelClientTest.php b/tests/Platform/Bridge/Google/Embeddings/EmbeddingsModelClientTest.php
new 
file mode 100644 index 00000000..49fc8b81 --- /dev/null +++ b/tests/Platform/Bridge/Google/Embeddings/EmbeddingsModelClientTest.php @@ -0,0 +1,102 @@ +createStub(ResponseInterface::class); + $response + ->method('toArray') + ->willReturn(json_decode($this->getEmbeddingStub(), true)); + + $httpClient = self::createMock(HttpClientInterface::class); + $httpClient->expects(self::once()) + ->method('request') + ->with( + 'POST', + 'https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:batchEmbedContents', + [ + 'headers' => ['x-goog-api-key' => 'test'], + 'json' => [ + 'requests' => [ + [ + 'model' => 'models/gemini-embedding-exp-03-07', + 'content' => ['parts' => [['text' => 'payload1']]], + 'outputDimensionality' => 1536, + 'taskType' => 'CLASSIFICATION', + ], + [ + 'model' => 'models/gemini-embedding-exp-03-07', + 'content' => ['parts' => [['text' => 'payload2']]], + 'outputDimensionality' => 1536, + 'taskType' => 'CLASSIFICATION', + ], + ], + ], + ], + ) + ->willReturn($response); + + $model = new Embeddings(Embeddings::GEMINI_EMBEDDING_EXP_03_07, ['dimensions' => 1536, 'task_type' => 'CLASSIFICATION']); + + $httpResponse = (new ModelClient($httpClient, 'test'))->request($model, ['payload1', 'payload2']); + self::assertSame(json_decode($this->getEmbeddingStub(), true), $httpResponse->toArray()); + } + + #[Test] + public function itConvertsAResponseToAVectorResponse(): void + { + $response = $this->createStub(ResponseInterface::class); + $response + ->method('toArray') + ->willReturn(json_decode($this->getEmbeddingStub(), true)); + + $httpClient = self::createMock(HttpClientInterface::class); + + $vectorResponse = (new ModelClient($httpClient, 'test'))->convert($response); + $convertedContent = $vectorResponse->getContent(); + + self::assertCount(2, $convertedContent); + + self::assertSame([0.3, 0.4, 0.4], $convertedContent[0]->getData()); + self::assertSame([0.0, 0.0, 0.2], $convertedContent[1]->getData()); + } + + private function 
getEmbeddingStub(): string + { + return <<<'JSON' + { + "embeddings": [ + { + "values": [0.3, 0.4, 0.4] + }, + { + "values": [0.0, 0.0, 0.2] + } + ] + } + JSON; + } +}