
Commit 41e3c4c

test(ai): Add gemini-2.5-flash to integration tests (#9110)
Added gemini-2.5-flash to our integration tests now that it's publicly available. The token counts differ slightly between 2.0-flash and 2.5-flash, so I introduced conditionals when checking token counts.
1 parent b97eab3 commit 41e3c4c
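
The whole change follows one pattern: branch on the model name, then assert token counts against model-specific expected values. A condensed sketch of that pattern (the expectedTokens helper is hypothetical, invented here for illustration; the actual tests inline these branches, as the diffs below show):

// Hypothetical helper illustrating the per-model branching used in the tests below.
// Numbers mirror the chat test's first turn; the commit itself inlines these branches.
interface TokenExpectations {
  promptTokenCount: number;
  candidatesTokenCount: number;
  totalTokenCount: number;
}

function expectedTokens(modelName: string): TokenExpectations | undefined {
  if (modelName.includes('gemini-2.5-flash')) {
    // 2.5-flash reports higher totals, plausibly because thinking tokens
    // are billed into totalTokenCount (an assumption, not stated in the commit).
    return { promptTokenCount: 17, candidatesTokenCount: 8, totalTokenCount: 49 };
  } else if (modelName.includes('gemini-2.0-flash')) {
    return { promptTokenCount: 15, candidatesTokenCount: 8, totalTokenCount: 23 };
  }
  return undefined; // Unknown model: make no token-count assertions.
}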

File tree

3 files changed: +129 -69 lines

packages/ai/integration/chat.test.ts
packages/ai/integration/constants.ts
packages/ai/integration/generate-content.test.ts


packages/ai/integration/chat.test.ts

Lines changed: 67 additions & 38 deletions
@@ -76,56 +76,85 @@ describe('Chat Session', () => {
         'What is the capital of France?'
       );
       const response1 = result1.response;
-      expect(response1.text().trim().toLowerCase()).to.include('paris');
+      const result2 = await chat.sendMessage('And what about Italy?');
+      const response2 = result2.response;
+      const history = await chat.getHistory();
 
-      let history = await chat.getHistory();
-      expect(history.length).to.equal(2);
+      expect(response1.text().trim().toLowerCase()).to.include('paris');
+      expect(response1.usageMetadata).to.not.be.null;
+      expect(response2.text().trim().toLowerCase()).to.include('rome');
+      expect(response2.usageMetadata).to.not.be.null;
+      expect(history.length).to.equal(4);
       expect(history[0].role).to.equal('user');
       expect(history[0].parts[0].text).to.equal(
         'What is the capital of France?'
       );
       expect(history[1].role).to.equal('model');
       expect(history[1].parts[0].text?.toLowerCase()).to.include('paris');
-
-      expect(response1.usageMetadata).to.not.be.null;
-      // Token counts can vary slightly in chat context
-      expect(response1.usageMetadata!.promptTokenCount).to.be.closeTo(
-        15, // "What is the capital of France?" + system instruction
-        TOKEN_COUNT_DELTA + 2 // More variance for chat context
-      );
-      expect(response1.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-        8, // "Paris"
-        TOKEN_COUNT_DELTA
-      );
-      expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
-        23, // "What is the capital of France?" + system instruction + "Paris"
-        TOKEN_COUNT_DELTA + 3 // More variance for chat context
-      );
-
-      const result2 = await chat.sendMessage('And what about Italy?');
-      const response2 = result2.response;
-      expect(response2.text().trim().toLowerCase()).to.include('rome');
-
-      history = await chat.getHistory();
-      expect(history.length).to.equal(4);
       expect(history[2].role).to.equal('user');
       expect(history[2].parts[0].text).to.equal('And what about Italy?');
       expect(history[3].role).to.equal('model');
       expect(history[3].parts[0].text?.toLowerCase()).to.include('rome');
 
-      expect(response2.usageMetadata).to.not.be.null;
-      expect(response2.usageMetadata!.promptTokenCount).to.be.closeTo(
-        28, // History + "And what about Italy?" + system instruction
-        TOKEN_COUNT_DELTA + 5 // More variance for chat context with history
-      );
-      expect(response2.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-        8,
-        TOKEN_COUNT_DELTA
-      );
-      expect(response2.usageMetadata!.totalTokenCount).to.be.closeTo(
-        36,
-        TOKEN_COUNT_DELTA
-      );
+      if (model.model.includes('gemini-2.5-flash')) {
+        // Token counts can vary slightly in chat context
+        expect(response1.usageMetadata!.promptTokenCount).to.be.closeTo(
+          17, // "What is the capital of France?" + system instruction
+          TOKEN_COUNT_DELTA + 2 // More variance for chat context
+        );
+        expect(response1.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          8, // "Paris"
+          TOKEN_COUNT_DELTA
+        );
+        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
+          49, // "What is the capital of France?" + system instruction + "Paris"
+          TOKEN_COUNT_DELTA + 3 // More variance for chat context
+        );
+        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
+          49, // "What is the capital of France?" + system instruction + "Paris"
+          TOKEN_COUNT_DELTA + 3 // More variance for chat context
+        );
+
+        expect(response2.usageMetadata!.promptTokenCount).to.be.closeTo(
+          32, // History + "And what about Italy?" + system instruction
+          TOKEN_COUNT_DELTA + 5 // More variance for chat context with history
+        );
+        expect(response2.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          8,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response2.usageMetadata!.totalTokenCount).to.be.closeTo(
+          68,
+          TOKEN_COUNT_DELTA + 2
+        );
+      } else if (model.model.includes('gemini-2.0-flash')) {
+        expect(response1.usageMetadata).to.not.be.null;
+        // Token counts can vary slightly in chat context
+        expect(response1.usageMetadata!.promptTokenCount).to.be.closeTo(
+          15, // "What is the capital of France?" + system instruction
+          TOKEN_COUNT_DELTA + 2 // More variance for chat context
+        );
+        expect(response1.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          8, // "Paris"
+          TOKEN_COUNT_DELTA
+        );
+        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
+          23, // "What is the capital of France?" + system instruction + "Paris"
+          TOKEN_COUNT_DELTA + 3 // More variance for chat context
+        );
+        expect(response2.usageMetadata!.promptTokenCount).to.be.closeTo(
+          28, // History + "And what about Italy?" + system instruction
+          TOKEN_COUNT_DELTA + 5 // More variance for chat context with history
+        );
+        expect(response2.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          8,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response2.usageMetadata!.totalTokenCount).to.be.closeTo(
+          36,
+          TOKEN_COUNT_DELTA
+        );
+      }
     });
   });
 });
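
Every count assertion above uses closeTo with TOKEN_COUNT_DELTA plus per-case padding rather than exact equality, since reported token counts drift slightly between runs. The constant is imported from integration/constants.ts and its value is not visible in this diff; a hypothetical definition, only to make the tolerance concrete:

// Hypothetical value: the real constant lives in
// packages/ai/integration/constants.ts and is not shown in this commit.
export const TOKEN_COUNT_DELTA = 5;

// closeTo(expected, delta) passes when |actual - expected| <= delta, so
// promptTokenCount asserted as closeTo(17, TOKEN_COUNT_DELTA + 2) would
// accept any value in [10, 24] under this assumed delta.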

packages/ai/integration/constants.ts

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ const backendNames: Map<BackendType, string> = new Map([
   [BackendType.VERTEX_AI, 'Vertex AI']
 ]);
 
-const modelNames: readonly string[] = ['gemini-2.0-flash'];
+const modelNames: readonly string[] = ['gemini-2.0-flash', 'gemini-2.5-flash'];
 
 /**
  * Array of test configurations that is iterated over to get full coverage
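
The doc comment cut off above describes testConfigs, the array iterated by both test files; each test reads model.model, which is how the model names added here reach the per-model branches. A sketch of how such a backend-by-model cross product could be assembled (the actual construction sits below this hunk and is not shown, so the shapes here are assumptions):

// Sketch only: the real testConfigs construction in constants.ts may differ.
interface TestConfig {
  backend: BackendType;
  model: string;
  toString(): string;
}

const testConfigs: readonly TestConfig[] = Array.from(backendNames.keys()).flatMap(
  backend =>
    modelNames.map(model => ({
      backend,
      model,
      toString: () => `${backendNames.get(backend)} ${model}`
    }))
);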

packages/ai/integration/generate-content.test.ts

Lines changed: 61 additions & 30 deletions
@@ -81,36 +81,67 @@ describe('Generate Content', () => {
       expect(trimmedText).to.equal('Mountain View');
 
       expect(response.usageMetadata).to.not.be.null;
-      expect(response.usageMetadata!.promptTokenCount).to.be.closeTo(
-        21,
-        TOKEN_COUNT_DELTA
-      );
-      expect(response.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-        4,
-        TOKEN_COUNT_DELTA
-      );
-      expect(response.usageMetadata!.totalTokenCount).to.be.closeTo(
-        25,
-        TOKEN_COUNT_DELTA * 2
-      );
-      expect(response.usageMetadata!.promptTokensDetails).to.not.be.null;
-      expect(response.usageMetadata!.promptTokensDetails!.length).to.equal(1);
-      expect(
-        response.usageMetadata!.promptTokensDetails![0].modality
-      ).to.equal(Modality.TEXT);
-      expect(
-        response.usageMetadata!.promptTokensDetails![0].tokenCount
-      ).to.equal(21);
-      expect(response.usageMetadata!.candidatesTokensDetails).to.not.be.null;
-      expect(
-        response.usageMetadata!.candidatesTokensDetails!.length
-      ).to.equal(1);
-      expect(
-        response.usageMetadata!.candidatesTokensDetails![0].modality
-      ).to.equal(Modality.TEXT);
-      expect(
-        response.usageMetadata!.candidatesTokensDetails![0].tokenCount
-      ).to.be.closeTo(4, TOKEN_COUNT_DELTA);
+
+      if (model.model.includes('gemini-2.5-flash')) {
+        expect(response.usageMetadata!.promptTokenCount).to.be.closeTo(
+          22,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          2,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response.usageMetadata!.totalTokenCount).to.be.closeTo(
+          55,
+          TOKEN_COUNT_DELTA * 2
+        );
+        expect(response.usageMetadata!.promptTokensDetails).to.not.be.null;
+        expect(response.usageMetadata!.promptTokensDetails!.length).to.equal(
+          1
+        );
+        expect(
+          response.usageMetadata!.promptTokensDetails![0].modality
+        ).to.equal(Modality.TEXT);
+        expect(
+          response.usageMetadata!.promptTokensDetails![0].tokenCount
+        ).to.closeTo(22, TOKEN_COUNT_DELTA);
+
+        // candidatesTokenDetails comes back about half the time, so let's just not test it.
+      } else if (model.model.includes('gemini-2.0-flash')) {
+        expect(response.usageMetadata!.promptTokenCount).to.be.closeTo(
+          21,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response.usageMetadata!.candidatesTokenCount).to.be.closeTo(
+          4,
+          TOKEN_COUNT_DELTA
+        );
+        expect(response.usageMetadata!.totalTokenCount).to.be.closeTo(
+          25,
+          TOKEN_COUNT_DELTA * 2
+        );
+        expect(response.usageMetadata!.promptTokensDetails).to.not.be.null;
+        expect(response.usageMetadata!.promptTokensDetails!.length).to.equal(
+          1
+        );
+        expect(
+          response.usageMetadata!.promptTokensDetails![0].modality
+        ).to.equal(Modality.TEXT);
+        expect(
+          response.usageMetadata!.promptTokensDetails![0].tokenCount
+        ).to.equal(21);
+        expect(response.usageMetadata!.candidatesTokensDetails).to.not.be
+          .null;
+        expect(
+          response.usageMetadata!.candidatesTokensDetails!.length
+        ).to.equal(1);
+        expect(
+          response.usageMetadata!.candidatesTokensDetails![0].modality
+        ).to.equal(Modality.TEXT);
+        expect(
+          response.usageMetadata!.candidatesTokensDetails![0].tokenCount
+        ).to.be.closeTo(4, TOKEN_COUNT_DELTA);
+      }
     });
 
     it('generateContentStream: text input, text output', async () => {
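
For gemini-2.5-flash the new branch skips candidatesTokensDetails entirely because, per the inline comment, it only comes back about half the time. An alternative would be to assert on it only when it is present; a hedged sketch of that option (not what the commit does), using only names already in scope in this test:

// Sketch: validate candidatesTokensDetails only when the backend returns it.
// The commit instead omits the check entirely for gemini-2.5-flash.
const details = response.usageMetadata!.candidatesTokensDetails;
if (details && details.length > 0) {
  expect(details[0].modality).to.equal(Modality.TEXT);
  expect(details[0].tokenCount).to.be.closeTo(2, TOKEN_COUNT_DELTA);
}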
