From 7c50482ed66060b4a6863bffaa931199b5785f21 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 15:29:09 +0100
Subject: [PATCH 01/51] Add JSON schema spec for audio-classification

---
 .../audio-classification/spec/input.json      | 30 +++++++++++++++++++
 .../audio-classification/spec/output.json     | 24 +++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 packages/tasks/src/tasks/audio-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/audio-classification/spec/output.json

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
new file mode 100644
index 0000000000..b0b8757b17
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -0,0 +1,30 @@
+{
+    "id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Audio Classification inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "On or several audio files to classify"
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/definitions/AudioClassificationParameters"
+        }
+    },
+    "definitions": {
+        "AudioClassificationParameters": {
+            "description": "Additional inference parameters for Audio Classification",
+            "type": "object",
+            "properties": {
+                "topK": {
+                    "type": "integer",
+                    "description": "When specified, limits the output to the top K most probable classes."
+                }
+            }
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
new file mode 100644
index 0000000000..dfe0f0c5e1
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -0,0 +1,24 @@
+{
+    "id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "title": "AudioClassificationOutput",
+    "description": "Outputs for Audio Classification inference",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "label": {
+                "type": "string",
+                "description": "The predicted class label (model specific)."
+            },
+            "score": {
+                "type": "number",
+                "description": "The corresponding probability."
+            }
+        },
+        "required": [
+            "label",
+            "score"
+        ]
+    }
+}
\ No newline at end of file

From fd98112b14fe8d7c89fe4a386dd938509bb37f7a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 15:31:29 +0100
Subject: [PATCH 02/51] Add JSON schema spec for text-generation

---
 .../src/tasks/text-generation/spec/input.json | 84 +++++++++++++++++++
 .../tasks/text-generation/spec/output.json    | 18 ++++
 2 files changed, 102 insertions(+)
 create mode 100644 packages/tasks/src/tasks/text-generation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-generation/spec/output.json

diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
new file mode 100644
index 0000000000..e0e73dd682
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -0,0 +1,84 @@
+{
+    "id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Text Generation inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "The text to initialize generation with",
+            "anyOf": [
+                {
+                    "type": "string"
+                },
+                {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            ]
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/definitions/TextGenerationParameters"
+        }
+    },
+    "definitions": {
+        "TextGenerationParameters": {
+            "description": "Additional inference parameters for Text Generation",
+            "type": "object",
+            "properties": {
+                "doSample": {
+                    "type": "boolean",
+                    "description": "Whether to use logit sampling (true) or greedy search (false)."
+                },
+                "maxNewTokens": {
+                    "type": "integer",
+                    "description": "Maximum number of generated tokens."
+                },
+                "repetitionPenalty": {
+                    "type": "number",
+                    "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
+                },
+                "returnFullText": {
+                    "type": "boolean",
+                    "description": "Whether to prepend the prompt to the generated text."
+                },
+                "stopSequences": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Stop generating tokens if a member of `stop_sequences` is generated."
+                },
+                "temperature": {
+                    "type": "number",
+                    "description": "The value used to modulate the logits distribution."
+                },
+                "topK": {
+                    "type": "integer",
+                    "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+                },
+                "topP": {
+                    "type": "number",
+                    "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
+                },
+                "truncate": {
+                    "type": "integer",
+                    "description": "Truncate input tokens to the given size."
+                },
+                "typicalP": {
+                    "type": "number",
+                    "description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
+                },
+                "watermark": {
+                    "type": "boolean",
+                    "description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
+                }
+            }
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
new file mode 100644
index 0000000000..1636539421
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -0,0 +1,18 @@
+{
+    "id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs for Text Generation inference",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "generatedText": {
+                "type": "string",
+                "description": "The generated text"
+            }
+        },
+        "required": [
+            "generatedText"
+        ]
+    }
+}
\ No newline at end of file

From 352e7c5f2fb75a6585f0f684ba940f69a3ebffd6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:00:07 +0100
Subject: [PATCH 03/51] =?UTF-8?q?=E2=9C=A8=20Add=20script=20to=20generate?=
 =?UTF-8?q?=20inference=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |  12 +-
 packages/tasks/pnpm-lock.yaml                 | 207 ++++++++++++++++++
 .../tasks/src/scripts/inference-codegen.ts    | 103 +++++++++
 3 files changed, 318 insertions(+), 4 deletions(-)
 create mode 100644 packages/tasks/src/scripts/inference-codegen.ts

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 0917d79756..d538c64874 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -24,9 +24,10 @@
 		"format": "prettier --write .",
 		"format:check": "prettier --check .",
 		"prepublishOnly": "pnpm run build",
-		"build": "tsup src/index.ts --format cjs,esm --clean --dts",
+		"build": "tsup src/index.ts src/scripts/**.ts --format cjs,esm --clean --dts",
 		"prepare": "pnpm run build",
-		"check": "tsc"
+		"check": "tsc",
+		"inference-codegen": "pnpm run build && node dist/scripts/inference-codegen.js"
 	},
 	"files": [
 		"dist",
@@ -40,5 +41,8 @@
 	],
 	"author": "Hugging Face",
 	"license": "MIT",
-	"devDependencies": {}
-}
+	"devDependencies": {
+		"@types/node": "^20.11.5",
+		"quicktype-core": "^23.0.81"
+	}
+}
\ No newline at end of file
diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml
index 2b9f1883a1..fedbbb7c3b 100644
--- a/packages/tasks/pnpm-lock.yaml
+++ b/packages/tasks/pnpm-lock.yaml
@@ -3,3 +3,210 @@ lockfileVersion: '6.0'
 settings:
   autoInstallPeers: true
   excludeLinksFromLockfile: false
+
+devDependencies:
+  '@types/node':
+    specifier: ^20.11.5
+    version: 20.11.5
+  quicktype-core:
+    specifier: ^23.0.81
+    version: 23.0.81
+
+packages:
+
+  /@glideapps/ts-necessities@2.1.3:
+    resolution: {integrity: sha512-q9U8v/n9qbkd2zDYjuX3qtlbl+OIyI9zF+zQhZjfYOE9VMDH7tfcUSJ9p0lXoY3lxmGFne09yi4iiNeQUwV7AA==}
+    dev: true
+
+  /@types/node@20.11.5:
+    resolution: {integrity: sha512-g557vgQjUUfN76MZAN/dt1z3dzcUsimuysco0KeluHgrPdJXkP/XdAURgyO2W9fZWHRtRBiVKzKn8vyOAwlG+w==}
+    dependencies:
+      undici-types: 5.26.5
+    dev: true
+
+  /@types/urijs@1.19.25:
+    resolution: {integrity: sha512-XOfUup9r3Y06nFAZh3WvO0rBU4OtlfPB/vgxpjg+NRdGU6CN6djdc6OEiH+PcqHCY6eFLo9Ista73uarf4gnBg==}
+    dev: true
+
+  /abort-controller@3.0.0:
+    resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==}
+    engines: {node: '>=6.5'}
+    dependencies:
+      event-target-shim: 5.0.1
+    dev: true
+
+  /base64-js@1.5.1:
+    resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
+    dev: true
+
+  /browser-or-node@2.1.1:
+    resolution: {integrity: sha512-8CVjaLJGuSKMVTxJ2DpBl5XnlNDiT4cQFeuCJJrvJmts9YrTZDizTX7PjC2s6W4x+MBGZeEY6dGMrF04/6Hgqg==}
+    dev: true
+
+  /buffer@6.0.3:
+    resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==}
+    dependencies:
+      base64-js: 1.5.1
+      ieee754: 1.2.1
+    dev: true
+
+  /collection-utils@1.0.1:
+    resolution: {integrity: sha512-LA2YTIlR7biSpXkKYwwuzGjwL5rjWEZVOSnvdUc7gObvWe4WkjxOpfrdhoP7Hs09YWDVfg0Mal9BpAqLfVEzQg==}
+    dev: true
+
+  /cross-fetch@4.0.0:
+    resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==}
+    dependencies:
+      node-fetch: 2.7.0
+    transitivePeerDependencies:
+      - encoding
+    dev: true
+
+  /event-target-shim@5.0.1:
+    resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==}
+    engines: {node: '>=6'}
+    dev: true
+
+  /events@3.3.0:
+    resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
+    engines: {node: '>=0.8.x'}
+    dev: true
+
+  /ieee754@1.2.1:
+    resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
+    dev: true
+
+  /is-url@1.2.4:
+    resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==}
+    dev: true
+
+  /js-base64@3.7.5:
+    resolution: {integrity: sha512-3MEt5DTINKqfScXKfJFrRbxkrnk2AxPWGBL/ycjz4dK8iqiSJ06UxD8jh8xuh6p10TX4t2+7FsBYVxxQbMg+qA==}
+    dev: true
+
+  /lodash@4.17.21:
+    resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==}
+    dev: true
+
+  /node-fetch@2.7.0:
+    resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
+    engines: {node: 4.x || >=6.0.0}
+    peerDependencies:
+      encoding: ^0.1.0
+    peerDependenciesMeta:
+      encoding:
+        optional: true
+    dependencies:
+      whatwg-url: 5.0.0
+    dev: true
+
+  /pako@0.2.9:
+    resolution: {integrity: sha512-NUcwaKxUxWrZLpDG+z/xZaCgQITkA/Dv4V/T6bw7VON6l1Xz/VnrBqrYjZQ12TamKHzITTfOEIYUj48y2KXImA==}
+    dev: true
+
+  /pako@1.0.11:
+    resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
+    dev: true
+
+  /pluralize@8.0.0:
+    resolution: {integrity: sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==}
+    engines: {node: '>=4'}
+    dev: true
+
+  /process@0.11.10:
+    resolution: {integrity: sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==}
+    engines: {node: '>= 0.6.0'}
+    dev: true
+
+  /quicktype-core@23.0.81:
+    resolution: {integrity: sha512-iJQpCEzSQIkffJPS5NC+0w+Rq9faGgz09L+WIbseu1toFfj+M/3KTG5jhzdY/uN88fWosAom2fMoEADA403+rQ==}
+    dependencies:
+      '@glideapps/ts-necessities': 2.1.3
+      '@types/urijs': 1.19.25
+      browser-or-node: 2.1.1
+      collection-utils: 1.0.1
+      cross-fetch: 4.0.0
+      is-url: 1.2.4
+      js-base64: 3.7.5
+      lodash: 4.17.21
+      pako: 1.0.11
+      pluralize: 8.0.0
+      readable-stream: 4.4.2
+      unicode-properties: 1.4.1
+      urijs: 1.19.11
+      wordwrap: 1.0.0
+      yaml: 2.3.4
+    transitivePeerDependencies:
+      - encoding
+    dev: true
+
+  /readable-stream@4.4.2:
+    resolution: {integrity: sha512-Lk/fICSyIhodxy1IDK2HazkeGjSmezAWX2egdtJnYhtzKEsBPJowlI6F6LPb5tqIQILrMbx22S5o3GuJavPusA==}
+    engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
+    dependencies:
+      abort-controller: 3.0.0
+      buffer: 6.0.3
+      events: 3.3.0
+      process: 0.11.10
+      string_decoder: 1.3.0
+    dev: true
+
+  /safe-buffer@5.2.1:
+    resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
+    dev: true
+
+  /string_decoder@1.3.0:
+    resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
+    dependencies:
+      safe-buffer: 5.2.1
+    dev: true
+
+  /tiny-inflate@1.0.3:
+    resolution: {integrity: sha512-pkY1fj1cKHb2seWDy0B16HeWyczlJA9/WW3u3c4z/NiWDsO3DOU5D7nhTLE9CF0yXv/QZFY7sEJmj24dK+Rrqw==}
+    dev: true
+
+  /tr46@0.0.3:
+    resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==}
+    dev: true
+
+  /undici-types@5.26.5:
+    resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
+    dev: true
+
+  /unicode-properties@1.4.1:
+    resolution: {integrity: sha512-CLjCCLQ6UuMxWnbIylkisbRj31qxHPAurvena/0iwSVbQ2G1VY5/HjV0IRabOEbDHlzZlRdCrD4NhB0JtU40Pg==}
+    dependencies:
+      base64-js: 1.5.1
+      unicode-trie: 2.0.0
+    dev: true
+
+  /unicode-trie@2.0.0:
+    resolution: {integrity: sha512-x7bc76x0bm4prf1VLg79uhAzKw8DVboClSN5VxJuQ+LKDOVEW9CdH+VY7SP+vX7xCYQqzzgQpFqz15zeLvAtZQ==}
+    dependencies:
+      pako: 0.2.9
+      tiny-inflate: 1.0.3
+    dev: true
+
+  /urijs@1.19.11:
+    resolution: {integrity: sha512-HXgFDgDommxn5/bIv0cnQZsPhHDA90NPHD6+c/v21U5+Sx5hoP8+dP9IZXBU1gIfvdRfhG8cel9QNPeionfcCQ==}
+    dev: true
+
+  /webidl-conversions@3.0.1:
+    resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==}
+    dev: true
+
+  /whatwg-url@5.0.0:
+    resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==}
+    dependencies:
+      tr46: 0.0.3
+      webidl-conversions: 3.0.1
+    dev: true
+
+  /wordwrap@1.0.0:
+    resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==}
+    dev: true
+
+  /yaml@2.3.4:
+    resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==}
+    engines: {node: '>= 14'}
+    dev: true
diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
new file mode 100644
index 0000000000..5aef9fcd31
--- /dev/null
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -0,0 +1,103 @@
+import type {
+    SerializedRenderResult
+} from "quicktype-core";
+import {
+    quicktype,
+    InputData,
+    JSONSchemaInput,
+    FetchingJSONSchemaStore
+
+} from "quicktype-core";
+import * as fs from "fs/promises";
+import { existsSync as pathExists } from "fs";
+import * as path from "path";
+
+const TYPESCRIPT_HEADER_FILE = `
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on ${new Date().toISOString()}
+ */
+
+`
+
+
+const rootDirFinder = function (): string {
+    const parts = __dirname.split("/");
+    let level = parts.length - 1;
+    while (level > 0) {
+        const currentPath = parts.slice(0, level).join("/");
+        console.debug(currentPath);
+        try {
+            require(`${currentPath}/package.json`);
+            return path.normalize(currentPath);
+        } catch (err) {
+            /// noop
+        }
+        level--;
+    }
+    return "";
+};
+
+async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
+    const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+    await schema.addSource({ name: `${taskId}-input`, schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }) });
+    await schema.addSource({ name: `${taskId}-output`, schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }) });
+    const inputData = new InputData();
+    inputData.addInput(schema);
+    return inputData;
+}
+
+
+async function generateTypescript(inputData: InputData): Promise<SerializedRenderResult> {
+    return await quicktype({
+        inputData,
+        lang: "typescript",
+        alphabetizeProperties: true,
+        rendererOptions: {
+            "just-types": true,
+            "nice-property-names": true,
+            "prefer-unions": true,
+            "prefer-const-values": true,
+        }
+    });
+}
+
+async function main() {
+    const rootDir = rootDirFinder();
+    const tasksDir = path.join(rootDir, "src", "tasks")
+    const allTasks = await Promise.all(
+        (await fs.readdir(tasksDir, { withFileTypes: true }))
+            .filter(entry => entry.isDirectory())
+            .map(async entry => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
+    );
+
+    for (const { task, dirPath } of allTasks) {
+        const taskSpecDir = path.join(dirPath, "spec")
+        if (!pathExists(taskSpecDir)) {
+            console.debug(`No spec found for task ${task} - skipping`);
+            continue
+        }
+        console.debug(`✨ Generating types for task`, task)
+
+        console.debug("   📦 Building input data")
+        const inputData = await buildInputData(task, taskSpecDir);
+
+        console.debug("   🏭 Generating typescript code")
+        {
+            const { lines } = await generateTypescript(inputData);
+            await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), { flag: "w+", encoding: "utf-8" });
+        }
+
+
+    }
+    console.debug("✅ All done!")
+}
+
+let exit = 0;
+main()
+    .catch(err => {
+        console.error("Failure", err);
+        exit = 1;
+    })
+    .finally(() => process.exit(exit));
\ No newline at end of file

From 5551f5b98263f1f0457095fbb9b7606fc6317f64 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:00:33 +0100
Subject: [PATCH 04/51] Add generated code

---
 .../tasks/audio-classification/inference.ts   | 50 +++++++++++
 .../src/tasks/text-generation/inference.ts    | 89 +++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 packages/tasks/src/tasks/audio-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-generation/inference.ts

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
new file mode 100644
index 0000000000..ea5e9076ee
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -0,0 +1,50 @@
+
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on 2024-01-19T14:59:10.562Z
+ */
+
+
+/**
+ * Inputs for Audio Classification inference
+ */
+export interface AudioClassificationInput {
+    /**
+     * On or several audio files to classify
+     */
+    inputs: any;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Audio Classification
+ */
+export interface AudioClassificationParameters {
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: any;
+}
+
+/**
+ * Outputs for Audio Classification inference
+ */
+export interface AudioClassificationOutput {
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
new file mode 100644
index 0000000000..ef9661482f
--- /dev/null
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -0,0 +1,89 @@
+
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ * 
+ * Generated on 2024-01-19T14:59:10.562Z
+ */
+
+
+/**
+ * Inputs for Text Generation inference
+ */
+export interface TextGenerationInput {
+    /**
+     * The text to initialize generation with
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text Generation
+ */
+export interface TextGenerationParameters {
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: any;
+}
+
+/**
+ * Outputs for Text Generation inference
+ */
+export interface TextGenerationOutput {
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: any;
+}

From fad594b648e1b61fadcb1a0f97aa0028c2697b90 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:04:07 +0100
Subject: [PATCH 05/51] =?UTF-8?q?=F0=9F=92=84=20Format=20with=20prettier=20via=20pnpm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |   2 +-
 .../tasks/src/scripts/inference-codegen.ts    | 145 ++++++++--------
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../audio-classification/spec/input.json      |  56 +++---
 .../audio-classification/spec/output.json     |  43 +++--
 .../src/tasks/text-generation/inference.ts    | 128 +++++++-------
 .../src/tasks/text-generation/spec/input.json | 164 +++++++++---------
 .../tasks/text-generation/spec/output.json    |  32 ++--
 8 files changed, 302 insertions(+), 318 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index d538c64874..258679abaf 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -45,4 +45,4 @@
 		"@types/node": "^20.11.5",
 		"quicktype-core": "^23.0.81"
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 5aef9fcd31..6edc31fea7 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -1,13 +1,5 @@
-import type {
-    SerializedRenderResult
-} from "quicktype-core";
-import {
-    quicktype,
-    InputData,
-    JSONSchemaInput,
-    FetchingJSONSchemaStore
-
-} from "quicktype-core";
+import type { SerializedRenderResult } from "quicktype-core";
+import { quicktype, InputData, JSONSchemaInput, FetchingJSONSchemaStore } from "quicktype-core";
 import * as fs from "fs/promises";
 import { existsSync as pathExists } from "fs";
 import * as path from "path";
@@ -19,85 +11,90 @@ const TYPESCRIPT_HEADER_FILE = `
  * Generated on ${new Date().toISOString()}
  */
 
-`
-
+`;
 
 const rootDirFinder = function (): string {
-    const parts = __dirname.split("/");
-    let level = parts.length - 1;
-    while (level > 0) {
-        const currentPath = parts.slice(0, level).join("/");
-        console.debug(currentPath);
-        try {
-            require(`${currentPath}/package.json`);
-            return path.normalize(currentPath);
-        } catch (err) {
-            /// noop
-        }
-        level--;
-    }
-    return "";
+	const parts = __dirname.split("/");
+	let level = parts.length - 1;
+	while (level > 0) {
+		const currentPath = parts.slice(0, level).join("/");
+		console.debug(currentPath);
+		try {
+			require(`${currentPath}/package.json`);
+			return path.normalize(currentPath);
+		} catch (err) {
+			/// noop
+		}
+		level--;
+	}
+	return "";
 };
 
 async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
-    const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
-    await schema.addSource({ name: `${taskId}-input`, schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }) });
-    await schema.addSource({ name: `${taskId}-output`, schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }) });
-    const inputData = new InputData();
-    inputData.addInput(schema);
-    return inputData;
+	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+	await schema.addSource({
+		name: `${taskId}-input`,
+		schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }),
+	});
+	await schema.addSource({
+		name: `${taskId}-output`,
+		schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }),
+	});
+	const inputData = new InputData();
+	inputData.addInput(schema);
+	return inputData;
 }
 
-
 async function generateTypescript(inputData: InputData): Promise<SerializedRenderResult> {
-    return await quicktype({
-        inputData,
-        lang: "typescript",
-        alphabetizeProperties: true,
-        rendererOptions: {
-            "just-types": true,
-            "nice-property-names": true,
-            "prefer-unions": true,
-            "prefer-const-values": true,
-        }
-    });
+	return await quicktype({
+		inputData,
+		lang: "typescript",
+		alphabetizeProperties: true,
+		rendererOptions: {
+			"just-types": true,
+			"nice-property-names": true,
+			"prefer-unions": true,
+			"prefer-const-values": true,
+		},
+	});
 }
 
 async function main() {
-    const rootDir = rootDirFinder();
-    const tasksDir = path.join(rootDir, "src", "tasks")
-    const allTasks = await Promise.all(
-        (await fs.readdir(tasksDir, { withFileTypes: true }))
-            .filter(entry => entry.isDirectory())
-            .map(async entry => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
-    );
-
-    for (const { task, dirPath } of allTasks) {
-        const taskSpecDir = path.join(dirPath, "spec")
-        if (!pathExists(taskSpecDir)) {
-            console.debug(`No spec found for task ${task} - skipping`);
-            continue
-        }
-        console.debug(`✨ Generating types for task`, task)
-
-        console.debug("   📦 Building input data")
-        const inputData = await buildInputData(task, taskSpecDir);
+	const rootDir = rootDirFinder();
+	const tasksDir = path.join(rootDir, "src", "tasks");
+	const allTasks = await Promise.all(
+		(await fs.readdir(tasksDir, { withFileTypes: true }))
+			.filter((entry) => entry.isDirectory())
+			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
+	);
 
-        console.debug("   🏭 Generating typescript code")
-        {
-            const { lines } = await generateTypescript(inputData);
-            await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), { flag: "w+", encoding: "utf-8" });
-        }
+	for (const { task, dirPath } of allTasks) {
+		const taskSpecDir = path.join(dirPath, "spec");
+		if (!pathExists(taskSpecDir)) {
+			console.debug(`No spec found for task ${task} - skipping`);
+			continue;
+		}
+		console.debug(`✨ Generating types for task`, task);
 
+		console.debug("   📦 Building input data");
+		const inputData = await buildInputData(task, taskSpecDir);
 
-    }
-    console.debug("✅ All done!")
+		console.debug("   🏭 Generating typescript code");
+		{
+			const { lines } = await generateTypescript(inputData);
+			await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), {
+				flag: "w+",
+				encoding: "utf-8",
+			});
+		}
+	}
+	console.debug("✅ All done!");
 }
 
 let exit = 0;
 main()
-    .catch(err => {
-        console.error("Failure", err);
-        exit = 1;
-    })
-    .finally(() => process.exit(exit));
\ No newline at end of file
+	.catch((err) => {
+		console.error("Failure", err);
+		exit = 1;
+	})
+	.finally(() => process.exit(exit));
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index ea5e9076ee..aa0e4e86cd 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Generated on 2024-01-19T14:59:10.562Z
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * On or several audio files to classify
-     */
-    inputs: any;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: any;
+	/**
+	 * One or several audio files to classify
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: any;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: any;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: any;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index b0b8757b17..f2f3fbfbf8 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -1,30 +1,28 @@
 {
-    "id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Audio Classification inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "On or several audio files to classify"
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/definitions/AudioClassificationParameters"
-        }
-    },
-    "definitions": {
-        "AudioClassificationParameters": {
-            "description": "Additional inference parameters for Audio Classification",
-            "type": "object",
-            "properties": {
-                "topK": {
-                    "type": "integer",
-                    "description": "When specified, limits the output to the top K most probable classes."
-                }
-            }
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Audio Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several audio files to classify"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/AudioClassificationParameters"
+		}
+	},
+	"definitions": {
+		"AudioClassificationParameters": {
+			"description": "Additional inference parameters for Audio Classification",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index dfe0f0c5e1..4985554dbe 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,24 +1,21 @@
 {
-    "id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "title": "AudioClassificationOutput",
-    "description": "Outputs for Audio Classification inference",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "label": {
-                "type": "string",
-                "description": "The predicted class label (model specific)."
-            },
-            "score": {
-                "type": "number",
-                "description": "The corresponding probability."
-            }
-        },
-        "required": [
-            "label",
-            "score"
-        ]
-    }
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "AudioClassificationOutput",
+	"description": "Outputs for Audio Classification inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index ef9661482f..2db6493ba6 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Generated on 2024-01-19T14:59:10.562Z
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: any;
+	/**
+	 * The text to initialize generation with
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: any;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: any;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stop_sequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `top_p` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: any;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: any;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index e0e73dd682..08f0387022 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -1,84 +1,82 @@
 {
-    "id": "http://huggingface.co/inference/schemas/text-generation/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Text Generation inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "The text to initialize generation with",
-            "anyOf": [
-                {
-                    "type": "string"
-                },
-                {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            ]
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/definitions/TextGenerationParameters"
-        }
-    },
-    "definitions": {
-        "TextGenerationParameters": {
-            "description": "Additional inference parameters for Text Generation",
-            "type": "object",
-            "properties": {
-                "doSample": {
-                    "type": "boolean",
-                    "description": "Whether to use logit sampling (true) or greedy search (false)."
-                },
-                "maxNewTokens": {
-                    "type": "integer",
-                    "description": "Maximum number of generated tokens."
-                },
-                "repetitionPenalty": {
-                    "type": "number",
-                    "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
-                },
-                "returnFullText": {
-                    "type": "boolean",
-                    "description": "Whether to prepend the prompt to the generated text."
-                },
-                "stopSequences": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    },
-                    "description": "Stop generating tokens if a member of `stop_sequences` is generated."
-                },
-                "temperature": {
-                    "type": "number",
-                    "description": "The value used to modulate the logits distribution."
-                },
-                "topK": {
-                    "type": "integer",
-                    "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
-                },
-                "topP": {
-                    "type": "number",
-                    "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
-                },
-                "truncate": {
-                    "type": "integer",
-                    "description": "Truncate input tokens to the given size."
-                },
-                "typicalP": {
-                    "type": "number",
-                    "description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
-                },
-                "watermark": {
-                    "type": "boolean",
-                    "description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
-                }
-            }
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text Generation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The text to initialize generation with",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextGenerationParameters"
+		}
+	},
+	"definitions": {
+		"TextGenerationParameters": {
+			"description": "Additional inference parameters for Text Generation",
+			"type": "object",
+			"properties": {
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use logit sampling (true) or greedy search (false)."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "Maximum number of generated tokens."
+				},
+				"repetitionPenalty": {
+					"type": "number",
+					"description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
+				},
+				"returnFullText": {
+					"type": "boolean",
+					"description": "Whether to prepend the prompt to the generated text."
+				},
+				"stopSequences": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "Stop generating tokens if a member of `stop_sequences` is generated."
+				},
+				"temperature": {
+					"type": "number",
+					"description": "The value used to modulate the logits distribution."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
+				},
+				"truncate": {
+					"type": "integer",
+					"description": "Truncate input tokens to the given size."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
+				},
+				"watermark": {
+					"type": "boolean",
+					"description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index 1636539421..ccbeaea209 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -1,18 +1,16 @@
 {
-    "id": "http://huggingface.co/inference/schemas/text-generation/output.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Outputs for Text Generation inference",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "generatedText": {
-                "type": "string",
-                "description": "The generated text"
-            }
-        },
-        "required": [
-            "generatedText"
-        ]
-    }
-}
\ No newline at end of file
+	"id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Text Generation inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text"
+			}
+		},
+		"required": ["generatedText"]
+	}
+}

From 9a8f327840784d39c91dc6301b07c1a0c2e9f3ed Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 16:54:41 +0100
Subject: [PATCH 06/51] misc fix

---
 packages/tasks/src/tasks/audio-classification/spec/output.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index 4985554dbe..ddacf5872b 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/output.schema.json",
+	"id": "http://huggingface.co/inference/schemas/audio-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"title": "AudioClassificationOutput",
 	"description": "Outputs for Audio Classification inference",

From 02ba10c04add23abda5c37a9e8d8f95e8662d25e Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:13:08 +0100
Subject: [PATCH 07/51] =?UTF-8?q?=E2=9C=A8=20Add=20specs=20for=20existing?=
 =?UTF-8?q?=20tasks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../spec/input.json                           | 23 +++++
 .../spec/output.json                          | 16 ++++
 .../tasks/depth-estimation/spec/input.json    | 28 ++++++
 .../tasks/depth-estimation/spec/output.json   |  9 ++
 .../spec/input.json                           | 94 +++++++++++++++++++
 .../spec/output.json                          | 35 +++++++
 .../tasks/feature-extraction/spec/input.json  | 34 +++++++
 .../tasks/feature-extraction/spec/output.json | 58 ++++++++++++
 .../tasks/src/tasks/fill-mask/spec/input.json | 53 +++++++++++
 .../src/tasks/fill-mask/spec/output.json      | 28 ++++++
 .../image-classification/spec/input.json      | 28 ++++++
 .../image-classification/spec/output.json     | 20 ++++
 .../tasks/image-segmentation/spec/input.json  | 51 ++++++++++
 .../tasks/image-segmentation/spec/output.json | 20 ++++
 .../src/tasks/image-to-image/spec/input.json  | 21 +++++
 .../src/tasks/image-to-image/spec/output.json |  9 ++
 .../src/tasks/image-to-text/spec/input.json   | 28 ++++++
 .../src/tasks/image-to-text/spec/output.json  | 16 ++++
 .../tasks/object-detection/spec/input.json    | 28 ++++++
 .../tasks/object-detection/spec/output.json   | 44 +++++++++
 .../tasks/question-answering/spec/input.json  | 77 +++++++++++++++
 .../tasks/question-answering/spec/output.json | 28 ++++++
 .../table-question-answering/spec/input.json  | 44 +++++++++
 .../table-question-answering/spec/output.json | 39 ++++++++
 .../tasks/text-classification/spec/input.json | 54 +++++++++++
 .../text-classification/spec/output.json      | 20 ++++
 .../src/tasks/text-to-speech/spec/input.json  | 34 +++++++
 .../src/tasks/text-to-speech/spec/output.json | 19 ++++
 .../token-classification/spec/input.json      | 72 ++++++++++++++
 .../token-classification/spec/output.json     | 32 +++++++
 .../video-classification/spec/input.json      | 36 +++++++
 .../video-classification/spec/output.json     | 20 ++++
 .../visual-question-answering/spec/input.json | 51 ++++++++++
 .../spec/output.json                          | 20 ++++
 .../zero-shot-classification/spec/input.json  | 60 ++++++++++++
 .../zero-shot-classification/spec/output.json | 20 ++++
 .../spec/input.json                           | 55 +++++++++++
 .../spec/output.json                          | 20 ++++
 .../spec/input.json                           | 50 ++++++++++
 .../spec/output.json                          | 44 +++++++++
 40 files changed, 1438 insertions(+)
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
 create mode 100644 packages/tasks/src/tasks/depth-estimation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/depth-estimation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/document-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/document-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/feature-extraction/spec/input.json
 create mode 100644 packages/tasks/src/tasks/feature-extraction/spec/output.json
 create mode 100644 packages/tasks/src/tasks/fill-mask/spec/input.json
 create mode 100644 packages/tasks/src/tasks/fill-mask/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-segmentation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-segmentation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-to-image/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-to-image/spec/output.json
 create mode 100644 packages/tasks/src/tasks/image-to-text/spec/input.json
 create mode 100644 packages/tasks/src/tasks/image-to-text/spec/output.json
 create mode 100644 packages/tasks/src/tasks/object-detection/spec/input.json
 create mode 100644 packages/tasks/src/tasks/object-detection/spec/output.json
 create mode 100644 packages/tasks/src/tasks/question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/table-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/table-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text-to-speech/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-speech/spec/output.json
 create mode 100644 packages/tasks/src/tasks/token-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/token-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/video-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/video-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/spec/input.json
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
new file mode 100644
index 0000000000..dfd1c4bdb1
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -0,0 +1,23 @@
+{
+	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Automatic Speech Recognition inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input audio data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/AutomaticSpeechRecognitionParameters"
+		}
+	},
+	"definitions": {
+		"AutomaticSpeechRecognitionParameters": {
+			"description": "Additional inference parameters for Automatic Speech Recognition",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
new file mode 100644
index 0000000000..e11153af65
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Automatic Speech Recognition task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"text": {
+				"type": "string",
+				"description": "The recognized text."
+			}
+		},
+		"required": ["text"]
+	}
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
new file mode 100644
index 0000000000..8483f13b50
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/depth-estimation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Depth Estimation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "The input image data"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/DepthEstimationParameters"
+		}
+	},
+	"definitions": {
+		"DepthEstimationParameters": {
+			"description": "Additional inference parameters for Depth Estimation",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
new file mode 100644
index 0000000000..643aaaa7b1
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -0,0 +1,9 @@
+{
+	"id": "http://huggingface.co/inference/schemas/depth-estimation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Depth Estimation task",
+	"type": "array",
+	"items": {
+		"description": "The output depth labels"
+	}
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
new file mode 100644
index 0000000000..dc72a24b2a
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -0,0 +1,94 @@
+{
+	"id": "http://huggingface.co/inference/schemas/document-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Document Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One document and question pair to answer, or a list of such pairs",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/DocumentAndQuestion"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/DocumentAndQuestion"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/DocumentQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"DocumentQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Document Question Answering",
+			"type": "object",
+			"properties": {
+				"docStride": {
+					"type": "integer",
+					"description": "If the words in the document are too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
+				},
+				"handleImpossibleAnswer": {
+					"type": "boolean",
+					"description": "Whether to accept impossible as an answer"
+				},
+				"lang": {
+					"type": "string",
+					"description": "Language to use while running OCR. Defaults to english."
+				},
+				"maxAnswerLen": {
+					"type": "integer",
+					"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
+				},
+				"maxSeqLen": {
+					"type": "integer",
+					"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using doc_stride as overlap) if needed."
+				},
+				"maxQuestionLen": {
+					"type": "integer",
+					"description": "The maximum length of the question after tokenization. It will be truncated if needed."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Can return less than top_k answers if there are not enough options available within the context."
+				},
+				"wordBoxes": {
+					"type": "array",
+					"description": "A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR step and use the provided bounding boxes instead.",
+					"items": {
+						"anyOf": [
+							{
+								"type": "string"
+							},
+							{
+								"type": "array",
+								"items": {
+									"type": "number"
+								},
+								"maxItems": 4,
+								"minItems": 4
+							}
+						]
+					}
+				}
+			}
+		},
+		"DocumentAndQuestion": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image on which the question is asked"
+				},
+				"question": {
+					"type": "string",
+					"description": "A question to ask of the document"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
new file mode 100644
index 0000000000..60f6b53147
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -0,0 +1,35 @@
+{
+	"id": "http://huggingface.co/inference/schemas/document-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Document Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question."
+			},
+			"score": {
+				"type": "number",
+				"description": "The probability associated to the answer."
+			},
+			"start": {
+				"type": "integer",
+				"description": "The start word index of the answer (in the OCR’d version of the input or provided word boxes)."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The end word index of the answer (in the OCR’d version of the input or provided word boxes)."
+			},
+			"words": {
+				"type": "array",
+				"items": {
+					"type": "integer"
+				},
+				"description": "The index of each word/box pair that is in the answer"
+			}
+		},
+		"required": ["answer", "score", "start", "end", "words"]
+	}
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
new file mode 100644
index 0000000000..afa1ec9980
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -0,0 +1,34 @@
+{
+	"id": "http://huggingface.co/inference/schemas/feature-extraction/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Feature Extraction inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to get the features of",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/FeatureExtractionParameters"
+		}
+	},
+	"definitions": {
+		"FeatureExtractionParameters": {
+			"description": "Additional inference parameters for Feature Extraction",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
new file mode 100644
index 0000000000..f2e0ce2bf7
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -0,0 +1,58 @@
+{
+	"id": "http://huggingface.co/inference/schemas/feature-extraction/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Feature Extraction task",
+	"type": "array",
+	"items": {
+		"description": "The features computed by the model, as a nested list of floats",
+		"$ref": "#/definitions/FeatureDimension"
+	},
+	"definitions": {
+		"FeatureDimension": {
+			"type": "array",
+			"items": {
+				"anyOf": [
+					{
+						"type": "number"
+					},
+					{
+						"type": "array",
+						"items": {
+							"anyOf": [
+								{
+									"type": "number"
+								},
+								{
+									"type": "array",
+									"items": {
+										"anyOf": [
+											{
+												"type": "number"
+											},
+											{
+												"type": "array",
+												"items": {
+													"anyOf": [
+														{
+															"type": "number"
+														},
+														{
+															"type": "array",
+															"items": {
+																"type": "number"
+															}
+														}
+													]
+												}
+											}
+										]
+									}
+								}
+							]
+						}
+					}
+				]
+			}
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
new file mode 100644
index 0000000000..b0588e21ad
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -0,0 +1,53 @@
+{
+	"id": "http://huggingface.co/inference/schemas/fill-mask/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Fill Mask inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts with masked tokens",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/FillMaskParameters"
+		}
+	},
+	"definitions": {
+		"FillMaskParameters": {
+			"description": "Additional inference parameters for Fill Mask",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When passed, overrides the number of predictions to return."
+				},
+				"targets": {
+					"anyOf": [
+						{
+							"type": "string"
+						},
+						{
+							"type": "array",
+							"items": {
+								"type": "string"
+							}
+						}
+					],
+					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower)."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
new file mode 100644
index 0000000000..9ecf5aff88
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/fill-mask/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Fill Mask task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"sequence": {
+				"type": "string",
+				"description": "The corresponding input with the mask token prediction."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability"
+			},
+			"token": {
+				"type": "integer",
+				"description": "The predicted token id (to replace the masked one)."
+			},
+			"tokenStr": {
+				"type": "string",
+				"description": "The predicted token (to replace the masked one)."
+			}
+		},
+		"required": ["sequence", "score", "token", "tokenStr"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
new file mode 100644
index 0000000000..a9d09224b8
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several image files to classify"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ImageClassificationParameters": {
+			"description": "Additional inference parameters for Image Classification",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
new file mode 100644
index 0000000000..f48dc3e770
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
new file mode 100644
index 0000000000..4063d6619f
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -0,0 +1,51 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-segmentation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image Segmentation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several image files to perform segmentation on"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageSegmentationParameters"
+		}
+	},
+	"definitions": {
+		"ImageSegmentationParameters": {
+			"description": "Additional inference parameters for Image Segmentation",
+			"type": "object",
+			"properties": {
+				"maskThreshold": {
+					"type": "number",
+					"description": "Threshold to use when turning the predicted masks into binary values."
+				},
+				"overlapMaskAreaThreshold": {
+					"type": "number",
+					"description": "Mask overlap threshold to eliminate small, disconnected segments."
+				},
+				"subtask": {
+					"type": "string",
+					"description": "Segmentation task to be performed, depending on model capabilities.",
+					"oneOf": [
+						{
+							"const": "instance"
+						},
+						{
+							"const": "panoptic"
+						},
+						{
+							"const": "semantic"
+						}
+					]
+				},
+				"threshold": {
+					"type": "number",
+					"description": "Probability threshold to filter out predicted masks."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
new file mode 100644
index 0000000000..694abf4932
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-segmentation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image Segmentation task",
+	"type": "array",
+	"items": {
+		"description": "A predicted mask / segment",
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The label of the predicted segment"
+			},
+			"mask": {
+				"description": "The corresponding mask as a black-and-white image"
+			}
+		},
+		"required": ["label", "mask"]
+	}
+}
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
new file mode 100644
index 0000000000..2d2978c3a5
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -0,0 +1,21 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-image/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Image inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more images to generate images from"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageToImageParameters"
+		}
+	},
+	"definitions": {
+		"ImageToImageParameters": {
+			"description": "Additional inference parameters for Image To Image"
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
new file mode 100644
index 0000000000..0ec41e4507
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -0,0 +1,9 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-image/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Image task",
+	"type": "array",
+	"items": {
+		"description": "The output image"
+	}
+}
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
new file mode 100644
index 0000000000..4055218474
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-text/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Image To Text inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to generate text for"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ImageToTextParameters"
+		}
+	},
+	"definitions": {
+		"ImageToTextParameters": {
+			"description": "Additional inference parameters for Image To Text",
+			"type": "object",
+			"properties": {
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
new file mode 100644
index 0000000000..0c0392b50c
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"id": "http://huggingface.co/inference/schemas/image-to-text/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Image To Text task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text."
+			}
+		},
+		"required": ["generatedText"]
+	}
+}
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
new file mode 100644
index 0000000000..7698570f6a
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/object-detection/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Object Detection inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several input images to perform object detection on"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ObjectDetectionParameters"
+		}
+	},
+	"definitions": {
+		"ObjectDetectionParameters": {
+			"description": "Additional inference parameters for Object Detection",
+			"type": "object",
+			"properties": {
+				"threshold": {
+					"type": "number",
+					"description": "The probability necessary to make a prediction."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
new file mode 100644
index 0000000000..ef46a22655
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/object-detection/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Object Detection task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted label for the bounding box"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"box": {
+				"$ref": "#/definitions/BoundingBox",
+				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+			}
+		},
+		"required": ["box", "label", "score"]
+	},
+	"definitions": {
+		"BoundingBox": {
+			"type": "object",
+			"properties": {
+				"xmin": {
+					"type": "integer"
+				},
+				"xmax": {
+					"type": "integer"
+				},
+				"ymin": {
+					"type": "integer"
+				},
+				"ymax": {
+					"type": "integer"
+				}
+			},
+			"required": ["xmin", "xmax", "ymin", "ymax"]
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
new file mode 100644
index 0000000000..9f1737f034
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -0,0 +1,77 @@
+{
+	"id": "http://huggingface.co/inference/schemas/question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several question+context pairs to answer",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/QuestionAnsweringInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/QuestionAnsweringInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/QuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"QuestionAnsweringInput": {
+			"type": "object",
+			"properties": {
+				"question": {
+					"type": "string",
+					"description": "The question to be answered"
+				},
+				"context": {
+					"type": "string",
+					"description": "The context to be used for answering the question"
+				}
+			},
+			"required": ["question", "context"]
+		},
+		"QuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Question Answering",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Note that we return fewer than topK answers if there are not enough options available within the context."
+				},
+				"docStride": {
+					"type": "integer",
+					"description": "If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
+				},
+				"maxAnswerLen": {
+					"type": "integer",
+					"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
+				},
+				"maxSeqLen": {
+					"type": "integer",
+					"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed."
+				},
+				"maxQuestionLen": {
+					"type": "integer",
+					"description": "The maximum length of the question after tokenization. It will be truncated if needed."
+				},
+				"handleImpossibleAnswer": {
+					"type": "boolean",
+					"description": "Whether to accept impossible as an answer."
+				},
+				"alignToWords": {
+					"type": "boolean",
+					"description": "Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese)"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/question-answering/spec/output.json b/packages/tasks/src/tasks/question-answering/spec/output.json
new file mode 100644
index 0000000000..eea7e8e511
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/spec/output.json
@@ -0,0 +1,28 @@
+{
+	"id": "http://huggingface.co/inference/schemas/question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question."
+			},
+			"score": {
+				"type": "number",
+				"description": "The probability associated to the answer."
+			},
+			"start": {
+				"type": "integer",
+				"description": "The character position in the input where the answer begins."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The character position in the input where the answer ends."
+			}
+		},
+		"required": ["answer", "score", "start", "end"]
+	}
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
new file mode 100644
index 0000000000..8d35bcf718
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/table-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Table Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several questions about a table",
+			"type": "object",
+			"properties": {
+				"table": {
+					"description": "The table to serve as context for the questions",
+					"type": "object"
+				},
+				"question": {
+					"description": "One or several questions to be answered about the table",
+					"anyOf": [
+						{
+							"type": "string"
+						},
+						{
+							"type": "array",
+							"items": {
+								"type": "string"
+							}
+						}
+					]
+				}
+			}
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TableQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"TableQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Table Question Answering",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
new file mode 100644
index 0000000000..bb7969e3b2
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -0,0 +1,39 @@
+{
+	"id": "http://huggingface.co/inference/schemas/table-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Table Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer of the question given the table. If there is an aggregator, the answer will be preceded by `AGGREGATOR >`."
+			},
+			"coordinates": {
+				"type": "array",
+				"description": "Coordinates of the cells of the answers.",
+				"items": {
+					"type": "array",
+					"items": {
+						"type": "integer"
+					},
+					"minItems": 2,
+					"maxItems": 2
+				}
+			},
+			"cells": {
+				"type": "array",
+				"description": "List of strings made up of the answer cell values.",
+				"items": {
+					"type": "string"
+				}
+			},
+			"aggregator": {
+				"type": "string",
+				"description": "If the model has an aggregator, this returns the aggregator."
+			}
+		},
+		"required": ["answer", "cells", "coordinates"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
new file mode 100644
index 0000000000..b3b44d2ab3
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -0,0 +1,54 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to classify",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextClassificationParameters"
+		}
+	},
+	"definitions": {
+		"TextClassificationParameters": {
+			"description": "Additional inference parameters for Text Classification",
+			"type": "object",
+			"properties": {
+				"functionToApply": {
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
new file mode 100644
index 0000000000..1c317ed02e
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
new file mode 100644
index 0000000000..c0e94850e5
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -0,0 +1,34 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-to-audio/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Audio inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to generate audio for",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TextToAudioParameters"
+		}
+	},
+	"definitions": {
+		"TextToAudioParameters": {
+			"description": "Additional inference parameters for Text To Audio",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
new file mode 100644
index 0000000000..e8e92bbf8f
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -0,0 +1,19 @@
+{
+	"id": "http://huggingface.co/inference/schemas/text-to-audio/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Audio task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"audio": {
+				"description": "The generated audio waveform."
+			},
+			"samplingRate": {
+				"type": "number",
+				"description": "The sampling rate of the generated audio waveform."
+			}
+		},
+		"required": ["audio", "samplingRate"]
+	}
+}
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
new file mode 100644
index 0000000000..d70c71a81f
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -0,0 +1,72 @@
+{
+	"id": "http://huggingface.co/inference/schemas/token-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Token Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts whose tokens are to be classified",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/TokenClassificationParameters"
+		}
+	},
+	"definitions": {
+		"TokenClassificationParameters": {
+			"description": "Additional inference parameters for Token Classification",
+			"type": "object",
+			"properties": {
+				"ignoreLabels": {
+					"type": "array",
+					"items": {
+						"type": "string"
+					},
+					"description": "A list of labels to ignore"
+				},
+				"stride": {
+					"type": "integer",
+					"description": "The number of overlapping tokens between chunks when splitting the input text."
+				},
+				"aggregationStrategy": {
+					"type": "string",
+					"description": "The strategy used to fuse tokens based on model predictions",
+					"oneOf": [
+						{
+							"const": "none",
+							"description": "Do not aggregate tokens"
+						},
+						{
+							"const": "simple",
+							"description": "Group consecutive tokens with the same label in a single entity."
+						},
+						{
+							"const": "first",
+							"description": "Similar to \"simple\", also preserves word integrity (use the label predicted for the first token in a word)."
+						},
+						{
+							"const": "average",
+							"description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score, averaged across the word's tokens)."
+						},
+						{
+							"const": "max",
+							"description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score across the word's tokens)."
+						}
+					]
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
new file mode 100644
index 0000000000..0e9a037684
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -0,0 +1,32 @@
+{
+	"id": "http://huggingface.co/inference/schemas/token-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Token Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"entityGroup": {
+				"type": "string",
+				"description": "The predicted label for that group of tokens"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"word": {
+				"type": "string",
+				"description": "The corresponding text"
+			},
+			"start": {
+				"type": "integer",
+				"description": "The character position in the input where this group begins."
+			},
+			"end": {
+				"type": "integer",
+				"description": "The character position in the input where this group ends."
+			}
+		},
+		"required": ["entityGroup", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
new file mode 100644
index 0000000000..9f58d0bf34
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -0,0 +1,36 @@
+{
+	"id": "http://huggingface.co/inference/schemas/video-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Video Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several videos to be classified"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/VideoClassificationParameters"
+		}
+	},
+	"definitions": {
+		"VideoClassificationParameters": {
+			"description": "Additional inference parameters for Video Classification",
+			"type": "object",
+			"properties": {
+				"numFrames": {
+					"type": "integer",
+					"description": "The number of sampled frames to consider for classification."
+				},
+				"frameSamplingRate": {
+					"type": "integer",
+					"description": "The sampling rate used to select frames from the video."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "When specified, limits the output to the top K most probable classes."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
new file mode 100644
index 0000000000..aa4e369f1f
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/video-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Video Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "The predicted class label (model specific)."
+			},
+			"score": {
+				"type": "number",
+				"description": "The corresponding probability."
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
new file mode 100644
index 0000000000..134351b6b5
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -0,0 +1,51 @@
+{
+	"id": "http://huggingface.co/inference/schemas/visual-question-answering/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Visual Question Answering inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more image-question pairs",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/VisualQuestionAnsweringInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/VisualQuestionAnsweringInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/VisualQuestionAnsweringParameters"
+		}
+	},
+	"definitions": {
+		"VisualQuestionAnsweringInput": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image."
+				},
+				"question": {
+					"description": "The question to answer based on the image."
+				}
+			},
+			"required": ["question", "image"]
+		},
+		"VisualQuestionAnsweringParameters": {
+			"description": "Additional inference parameters for Visual Question Answering",
+			"type": "object",
+			"properties": {
+				"topK": {
+					"type": "integer",
+					"description": "The number of answers to return (will be chosen by order of likelihood). Note that fewer than topK answers are returned if there are not enough options available within the context."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
new file mode 100644
index 0000000000..808957434b
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/visual-question-answering/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Visual Question Answering task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"answer": {
+				"type": "string",
+				"description": "The answer to the question"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["answer", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
new file mode 100644
index 0000000000..3682a9ddfb
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -0,0 +1,60 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several text + candidate labels pairs to classify",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotClassificationInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotClassificationInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotClassificationInput": {
+			"type": "object",
+			"properties": {
+				"text": {
+					"type": "string",
+					"description": "The text to classify"
+				},
+				"candidateLabels": {
+					"type": "array",
+					"description": "The set of possible class labels to classify the text into.",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["text", "candidateLabels"]
+		},
+		"ZeroShotClassificationParameters": {
+			"description": "Additional inference parameters for Zero Shot Classification",
+			"type": "object",
+			"properties": {
+				"hypothesisTemplate": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+				},
+				"multiLabel": {
+					"type": "boolean",
+					"description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
new file mode 100644
index 0000000000..478e0bef20
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
new file mode 100644
index 0000000000..4e60af72ef
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -0,0 +1,55 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Image Classification inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to classify",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotImageClassificationInput"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotImageClassificationInput"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotImageClassificationParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotImageClassificationInput": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image data to classify"
+				},
+				"candidateLabels": {
+					"description": "The candidate labels for this image",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["image", "candidateLabels"]
+		},
+		"ZeroShotImageClassificationParameters": {
+			"description": "Additional inference parameters for Zero Shot Image Classification",
+			"type": "object",
+			"properties": {
+				"hypothesisTemplate": {
+					"type": "string",
+					"description": "The sentence used in conjunction with candidateLabels to attempt the image classification by replacing the placeholder with the candidate labels."
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
new file mode 100644
index 0000000000..a400d66224
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -0,0 +1,20 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Image Classification task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			}
+		},
+		"required": ["label", "score"]
+	}
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
new file mode 100644
index 0000000000..93e95f25f7
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -0,0 +1,50 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Zero Shot Object Detection inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several images to perform object detection on",
+			"anyOf": [
+				{
+					"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/definitions/ZeroShotObjectDetectionParameters"
+		}
+	},
+	"definitions": {
+		"ZeroShotObjectDetectionInputs": {
+			"type": "object",
+			"properties": {
+				"image": {
+					"description": "The image data to generate bounding boxes from"
+				},
+				"candidateLabels": {
+					"description": "The candidate labels for this image",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["image", "candidateLabels"]
+		},
+		"ZeroShotObjectDetectionParameters": {
+			"description": "Additional inference parameters for Zero Shot Object Detection",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
new file mode 100644
index 0000000000..c5fd05eb3d
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -0,0 +1,44 @@
+{
+	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Zero Shot Object Detection task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"label": {
+				"type": "string",
+				"description": "A candidate label"
+			},
+			"score": {
+				"type": "number",
+				"description": "The associated score / probability"
+			},
+			"box": {
+				"$ref": "#/definitions/BoundingBox",
+				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
+			}
+		},
+		"required": ["box", "label", "score"]
+	},
+	"definitions": {
+		"BoundingBox": {
+			"type": "object",
+			"properties": {
+				"xmin": {
+					"type": "integer"
+				},
+				"xmax": {
+					"type": "integer"
+				},
+				"ymin": {
+					"type": "integer"
+				},
+				"ymax": {
+					"type": "integer"
+				}
+			},
+			"required": ["xmin", "xmax", "ymin", "ymax"]
+		}
+	}
+}

From 93c37f5cd9c7c72012229e21dcb9aebbc5d34ddc Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:15:01 +0100
Subject: [PATCH 08/51] =?UTF-8?q?=F0=9F=A9=B9=20Ignore=20placeholder=20whe?=
 =?UTF-8?q?n=20generating=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 6edc31fea7..0dcae447f2 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -65,6 +65,7 @@ async function main() {
 	const allTasks = await Promise.all(
 		(await fs.readdir(tasksDir, { withFileTypes: true }))
 			.filter((entry) => entry.isDirectory())
+			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
 

From bbf72eca743a4d4ce3527d5d0f4b0a68a3e3e9b4 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:16:33 +0100
Subject: [PATCH 09/51] =?UTF-8?q?=F0=9F=A9=B9=20Fix:=20ensure=20spec=20fil?=
 =?UTF-8?q?es=20exist?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 0dcae447f2..9ffe24bd42 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -71,7 +71,7 @@ async function main() {
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
-		if (!pathExists(taskSpecDir)) {
+		if (!(pathExists(path.join(taskSpecDir, "input.json")) && pathExists(path.join(taskSpecDir, "output.json")))) {
 			console.debug(`No spec found for task ${task} - skipping`);
 			continue;
 		}

From 16a9bebe0cbdbd82d59b2d6dbe1bcbf4177524e7 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:17:13 +0100
Subject: [PATCH 10/51] =?UTF-8?q?=E2=9C=A8=20Generate=20inference=20types?=
 =?UTF-8?q?=20for=20existing=20tasks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/audio-classification/inference.ts   |   2 +-
 .../automatic-speech-recognition/inference.ts |  31 ++++++
 .../src/tasks/depth-estimation/inference.ts   |  33 ++++++
 .../document-question-answering/inference.ts  | 101 ++++++++++++++++++
 .../src/tasks/feature-extraction/inference.ts |  20 ++++
 .../tasks/src/tasks/fill-mask/inference.ts    |  63 +++++++++++
 .../tasks/image-classification/inference.ts   |  48 +++++++++
 .../src/tasks/image-segmentation/inference.ts |  64 +++++++++++
 .../src/tasks/image-to-image/inference.ts     |  20 ++++
 .../src/tasks/image-to-text/inference.ts      |  44 ++++++++
 .../src/tasks/object-detection/inference.ts   |  65 +++++++++++
 .../src/tasks/question-answering/inference.ts |  99 +++++++++++++++++
 .../table-question-answering/inference.ts     |  59 ++++++++++
 .../tasks/text-classification/inference.ts    |  54 ++++++++++
 .../src/tasks/text-generation/inference.ts    |   2 +-
 .../src/tasks/text-to-speech/inference.ts     |  35 ++++++
 .../tasks/token-classification/inference.ts   |  85 +++++++++++++++
 .../tasks/video-classification/inference.ts   |  56 ++++++++++
 .../visual-question-answering/inference.ts    |  63 +++++++++++
 .../zero-shot-classification/inference.ts     |  67 ++++++++++++
 .../inference.ts                              |  61 +++++++++++
 .../zero-shot-object-detection/inference.ts   |  64 +++++++++++
 22 files changed, 1134 insertions(+), 2 deletions(-)
 create mode 100644 packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
 create mode 100644 packages/tasks/src/tasks/depth-estimation/inference.ts
 create mode 100644 packages/tasks/src/tasks/document-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/feature-extraction/inference.ts
 create mode 100644 packages/tasks/src/tasks/fill-mask/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-segmentation/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-to-image/inference.ts
 create mode 100644 packages/tasks/src/tasks/image-to-text/inference.ts
 create mode 100644 packages/tasks/src/tasks/object-detection/inference.ts
 create mode 100644 packages/tasks/src/tasks/question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/table-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-to-speech/inference.ts
 create mode 100644 packages/tasks/src/tasks/token-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/video-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/visual-question-answering/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
 create mode 100644 packages/tasks/src/tasks/zero-shot-object-detection/inference.ts

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index aa0e4e86cd..33764ad334 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T14:59:10.562Z
+ * Generated on 2024-01-19T16:16:01.752Z
  */
 
 /**
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
new file mode 100644
index 0000000000..84540aa706
--- /dev/null
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -0,0 +1,31 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Automatic Speech Recognition inference
+ */
+export interface AutomaticSpeechRecognitionInput {
+	/**
+	 * The input audio data
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Automatic Speech Recognition task
+ */
+export interface AutomaticSpeechRecognitionOutput {
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
new file mode 100644
index 0000000000..0feea79b11
--- /dev/null
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -0,0 +1,33 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Depth Estimation inference
+ */
+export interface DepthEstimationInput {
+	/**
+	 * The input image data
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Depth Estimation
+ */
+export interface DepthEstimationParameters {
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
new file mode 100644
index 0000000000..ac1f848651
--- /dev/null
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -0,0 +1,101 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Document Question Answering inference
+ */
+export interface DocumentQuestionAnsweringInput {
+	/**
+	 * One or several document+question pairs to answer
+	 */
+	inputs: DocumentAndQuestion[] | DocumentAndQuestion;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface DocumentAndQuestion {
+	/**
+	 * The image on which the question is asked
+	 */
+	image?: any;
+	/**
+	 * A question to ask of the document
+	 */
+	question?: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Document Question Answering
+ */
+export interface DocumentQuestionAnsweringParameters {
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Document Question Answering task
+ */
+export interface DocumentQuestionAnsweringOutput {
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
new file mode 100644
index 0000000000..8a043fc20a
--- /dev/null
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -0,0 +1,20 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Feature Extraction inference
+ */
+export interface FeatureExtractionInput {
+	/**
+	 * One or several texts to get the features of
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
new file mode 100644
index 0000000000..6a49d856d0
--- /dev/null
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -0,0 +1,63 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Fill Mask inference
+ */
+export interface FillMaskInput {
+	/**
+	 * One or several texts with masked tokens
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Fill Mask
+ */
+export interface FillMaskParameters {
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[] | string;
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Fill Mask task
+ */
+export interface FillMaskOutput {
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
new file mode 100644
index 0000000000..0d5d438d6f
--- /dev/null
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -0,0 +1,48 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image Classification inference
+ */
+export interface ImageClassificationInput {
+	/**
+	 * One or several image files to classify
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image Classification
+ */
+export interface ImageClassificationParameters {
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Image Classification task
+ */
+export interface ImageClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
new file mode 100644
index 0000000000..88548bc2b2
--- /dev/null
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -0,0 +1,64 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image Segmentation inference
+ */
+export interface ImageSegmentationInput {
+	/**
+	 * One or several image files to perform segmentation on
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image Segmentation
+ */
+export interface ImageSegmentationParameters {
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: Subtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: any;
+}
+
+export type Subtask = "instance" | "panoptic" | "semantic";
+
+/**
+ * Outputs of inference for the Image Segmentation task
+ *
+ * A predicted mask / segment
+ */
+export interface ImageSegmentationOutput {
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: any;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
new file mode 100644
index 0000000000..6150a5eb8d
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -0,0 +1,20 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image To Image inference
+ */
+export interface ImageToImageInput {
+	/**
+	 * One or more images to generate images from
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: any;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
new file mode 100644
index 0000000000..66f5202486
--- /dev/null
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -0,0 +1,44 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Image To Text inference
+ */
+export interface ImageToTextInput {
+	/**
+	 * One or several images to generate text for
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Text
+ */
+export interface ImageToTextParameters {
+	/**
+	 * The maximum number of tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Image To Text task
+ */
+export interface ImageToTextOutput {
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
new file mode 100644
index 0000000000..22edb2cced
--- /dev/null
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -0,0 +1,65 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Object Detection inference. Only `inputs` is required.
+ */
+export interface ObjectDetectionInput {
+	/**
+	 * One or several input images to perform object detection on (typed as `any`)
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Object Detection.
+ *
+ * Every field is optional.
+ */
+export interface ObjectDetectionParameters {
+	/**
+	 * The minimum probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Object Detection task: a labelled, scored bounding box.
+ */
+export interface ObjectDetectionOutput {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The score / probability associated with the predicted label
+	 */
+	score: number;
+	[property: string]: any;
+}
+
+/**
+ * The predicted bounding box, given as min/max extents along x and y. Coordinates are
+ * relative to the top left corner of the input image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
new file mode 100644
index 0000000000..829e1fd1bb
--- /dev/null
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -0,0 +1,99 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Question Answering inference. Only `inputs` is required.
+ */
+export interface QuestionAnsweringInput {
+	/**
+	 * A single question+context pair, or an array of such pairs, to answer
+	 */
+	inputs: QuestionAnsweringInputElement[] | QuestionAnsweringInputElement;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface QuestionAnsweringInputElement {
+	/**
+	 * The context to be used for answering the question (required)
+	 */
+	context: string;
+	/**
+	 * The question to be answered from the context (required)
+	 */
+	question: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Question Answering.
+ *
+ * Every field is optional.
+ */
+export interface QuestionAnsweringParameters {
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (i.e., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that
+	 * fewer than topK answers are returned if there are not enough options available within
+	 * the context.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Question Answering task. All four fields are required.
+ */
+export interface QuestionAnsweringOutput {
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated with the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
new file mode 100644
index 0000000000..8b7a3d2275
--- /dev/null
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Table Question Answering inference. Only `inputs` is required.
+ */
+export interface TableQuestionAnsweringInput {
+	/**
+	 * One or several questions about a table, together with that table
+	 */
+	inputs: Inputs;
+	/**
+	 * Additional inference parameters (optional, free-form key/value map)
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * One or several questions about a table. Note that both fields are optional in this type.
+ */
+export interface Inputs {
+	/**
+	 * A single question, or an array of questions, to be answered about the table
+	 */
+	question?: string[] | string;
+	/**
+	 * The table to serve as context for the questions (free-form key/value map)
+	 */
+	table?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Table Question Answering task. Only `aggregator` is optional.
+ */
+export interface TableQuestionAnsweringOutput {
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer to the question given the table. If there is an aggregator, the answer will
+	 * be preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers (one array of numbers per cell).
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
new file mode 100644
index 0000000000..75a7032a23
--- /dev/null
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -0,0 +1,54 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Text Classification inference. Only `inputs` is required.
+ */
+export interface TextClassificationInput {
+	/**
+	 * A single text, or an array of texts, to classify
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Text Classification.
+ *
+ * Every field is optional.
+ */
+export interface TextClassificationParameters {
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: FunctionToApply;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+export type FunctionToApply = "sigmoid" | "softmax" | "none"; // values accepted by `functionToApply`
+
+/**
+ * Outputs of inference for the Text Classification task: a label and its probability.
+ */
+export interface TextClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The probability corresponding to `label`.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 2db6493ba6..86725e74e6 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T14:59:10.562Z
+ * Generated on 2024-01-19T16:16:01.752Z
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
new file mode 100644
index 0000000000..23ad75189b
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -0,0 +1,35 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Text To Audio inference (this type is the one used for the text-to-speech task)
+ */
+export interface TextToSpeechInput {
+	/**
+	 * A single text, or an array of texts, to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters (optional, free-form key/value map)
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Text To Audio task (type used for the text-to-speech task)
+ */
+export interface TextToSpeechOutput {
+	/**
+	 * The generated audio waveform (typed as `any`).
+	 */
+	audio: any;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
new file mode 100644
index 0000000000..3b7e5e0b3f
--- /dev/null
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -0,0 +1,85 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Token Classification inference. Only `inputs` is required.
+ */
+export interface TokenClassificationInput {
+	/**
+	 * One or several texts whose tokens are to be classified
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Token Classification.
+ *
+ * Every field is optional.
+ */
+export interface TokenClassificationParameters {
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: AggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: any;
+}
+
+/**
+ * - "none": Do not aggregate tokens.
+ *
+ * - "simple": Group consecutive tokens with the same label in a single entity.
+ *
+ * - "first": Similar to "simple", also preserves word integrity (uses the label predicted
+ *   for the first token in a word).
+ *
+ * - "average": Similar to "simple", also preserves word integrity (uses the label with the
+ *   highest score, averaged across the word's tokens).
+ *
+ * - "max": Similar to "simple", also preserves word integrity (uses the label with the
+ *   highest score across the word's tokens).
+ */
+export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max";
+
+/**
+ * Outputs of inference for the Token Classification task. Only `label` and `score` are required.
+ */
+export interface TokenClassificationOutput {
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: any; // NOTE(review): undocumented, required and typed as `any` - confirm against the spec
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The text corresponding to this group of tokens
+	 */
+	word?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
new file mode 100644
index 0000000000..b6faf9d0ff
--- /dev/null
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -0,0 +1,56 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Video Classification inference. Only `inputs` is required.
+ */
+export interface VideoClassificationInput {
+	/**
+	 * One or several videos to be classified (typed as `any`)
+	 */
+	inputs: any;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Video Classification.
+ *
+ * Every field is optional.
+ */
+export interface VideoClassificationParameters {
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Video Classification task: a label and its probability.
+ */
+export interface VideoClassificationOutput {
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The probability corresponding to `label`.
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
new file mode 100644
index 0000000000..c2175d49cd
--- /dev/null
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -0,0 +1,63 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Visual Question Answering inference. Only `inputs` is required.
+ */
+export interface VisualQuestionAnsweringInput {
+	/**
+	 * A single image-question pair, or an array of such pairs
+	 */
+	inputs: VisualQuestionAnsweringInputElement[] | VisualQuestionAnsweringInputElement;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: any;
+}
+
+export interface VisualQuestionAnsweringInputElement {
+	/**
+	 * The image the question is about (typed as `any`).
+	 */
+	image: any;
+	/**
+	 * The question to answer based on the image (typed as `any`).
+	 */
+	question: any;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Visual Question Answering.
+ *
+ * Every field is optional.
+ */
+export interface VisualQuestionAnsweringParameters {
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that
+	 * fewer than topK answers are returned if there are not enough options available within
+	 * the context.
+	 */
+	topK?: number;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Visual Question Answering task. `answer` is optional.
+ */
+export interface VisualQuestionAnsweringOutput {
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: any; // NOTE(review): undocumented, required and typed as `any` - confirm against the spec
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
new file mode 100644
index 0000000000..67cd325eae
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -0,0 +1,67 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Classification inference. Only `inputs` is required.
+ */
+export interface ZeroShotClassificationInput {
+	/**
+	 * A single text + candidate labels pair, or an array of such pairs, to classify
+	 */
+	inputs: ZeroShotClassificationInputElement[] | ZeroShotClassificationInputElement;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: any;
+}
+
+export interface ZeroShotClassificationInputElement {
+	/**
+	 * The set of possible class labels to classify the text into (required).
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify (required)
+	 */
+	text: string;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Zero Shot Classification.
+ *
+ * Every field is optional.
+ */
+export interface ZeroShotClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Classification task: a candidate label and its score.
+ */
+export interface ZeroShotClassificationOutput {
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The score / probability associated with `label`
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
new file mode 100644
index 0000000000..21e01d1793
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -0,0 +1,61 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Image Classification inference. Only `inputs` is required.
+ */
+export interface ZeroShotImageClassificationInput {
+	/**
+	 * A single image + candidate labels pair, or an array of such pairs, to classify
+	 */
+	inputs: ZeroShotImageClassificationInputElement[] | ZeroShotImageClassificationInputElement;
+	/**
+	 * Additional inference parameters (optional)
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: any;
+}
+
+export interface ZeroShotImageClassificationInputElement {
+	/**
+	 * The candidate labels for this image (required)
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify (typed as `any`)
+	 */
+	image: any;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters for Zero Shot Image Classification.
+ *
+ * Every field is optional.
+ */
+export interface ZeroShotImageClassificationParameters {
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Image Classification task: a candidate label and its score.
+ */
+export interface ZeroShotImageClassificationOutput {
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The score / probability associated with `label`
+	 */
+	score: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
new file mode 100644
index 0000000000..815c99cfed
--- /dev/null
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -0,0 +1,64 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Generated on 2024-01-19T16:16:01.752Z
+ */
+
+/**
+ * Inputs for Zero Shot Object Detection inference. Only `inputs` is required.
+ */
+export interface ZeroShotObjectDetectionInput {
+	/**
+	 * A single image + candidate labels pair, or an array of such pairs, to run detection on
+	 */
+	inputs: ZeroShotObjectDetectionInputs[] | ZeroShotObjectDetectionInputs;
+	/**
+	 * Additional inference parameters (optional, free-form key/value map)
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+export interface ZeroShotObjectDetectionInputs {
+	/**
+	 * The candidate labels for this image (required)
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from (typed as `any`)
+	 */
+	image: any;
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Zero Shot Object Detection task: a labelled, scored bounding box.
+ */
+export interface ZeroShotObjectDetectionOutput {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The score / probability associated with `label`
+	 */
+	score: number;
+	[property: string]: any;
+}
+
+/**
+ * The predicted bounding box, given as min/max extents along x and y. Coordinates are
+ * relative to the top left corner of the input image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: any;
+}

From b27846cbb4253685fcc65cf010694bed9b23e75f Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 17:59:22 +0100
Subject: [PATCH 11/51] =?UTF-8?q?=E2=9C=A8=20Support=20cross-file=20refere?=
 =?UTF-8?q?nces?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 9ffe24bd42..d256c81c58 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -8,7 +8,7 @@ const TYPESCRIPT_HEADER_FILE = `
 /**
  * Inference code generated from the JSON schema spec in ./spec
  * 
- * Generated on ${new Date().toISOString()}
+ * Using src/scripts/inference-codegen
  */
 
 `;
@@ -30,8 +30,14 @@ const rootDirFinder = function (): string {
 	return "";
 };
 
-async function buildInputData(taskId: string, taskSpecDir: string): Promise<InputData> {
-	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore());
+/**
+ * Build the schema InputData for a single task from the JSON spec files in its spec directory.
+ * @param taskId The ID of the task for which we are generating code
+ * @param taskSpecDir The path to the directory where the input.json & output.json files are
+ * @param allSpecFiles An array of paths to all the tasks specs. Allows resolving cross-file references ($ref).
+ */
+async function buildInputData(taskId: string, taskSpecDir: string, allSpecFiles: string[]): Promise<InputData> {
+	const schema = new JSONSchemaInput(new FetchingJSONSchemaStore(), [], allSpecFiles);
 	await schema.addSource({
 		name: `${taskId}-input`,
 		schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }),
@@ -68,6 +74,9 @@ async function main() {
 			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
+	const allSpecFiles = allTasks
+		.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
+		.filter((filepath) => pathExists(filepath));
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
@@ -78,7 +87,7 @@ async function main() {
 		console.debug(`✨ Generating types for task`, task);
 
 		console.debug("   📦 Building input data");
-		const inputData = await buildInputData(task, taskSpecDir);
+		const inputData = await buildInputData(task, taskSpecDir, allSpecFiles);
 
 		console.debug("   🏭 Generating typescript code");
 		{

From 7d9a9f63d49009629f7564c8ae42bbc60ebb4e8c Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:00:36 +0100
Subject: [PATCH 12/51] regen following header change

---
 packages/tasks/src/tasks/audio-classification/inference.ts      | 2 +-
 .../tasks/src/tasks/automatic-speech-recognition/inference.ts   | 2 +-
 packages/tasks/src/tasks/depth-estimation/inference.ts          | 2 +-
 .../tasks/src/tasks/document-question-answering/inference.ts    | 2 +-
 packages/tasks/src/tasks/feature-extraction/inference.ts        | 2 +-
 packages/tasks/src/tasks/fill-mask/inference.ts                 | 2 +-
 packages/tasks/src/tasks/image-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/image-segmentation/inference.ts        | 2 +-
 packages/tasks/src/tasks/image-to-image/inference.ts            | 2 +-
 packages/tasks/src/tasks/image-to-text/inference.ts             | 2 +-
 packages/tasks/src/tasks/object-detection/inference.ts          | 2 +-
 packages/tasks/src/tasks/question-answering/inference.ts        | 2 +-
 packages/tasks/src/tasks/table-question-answering/inference.ts  | 2 +-
 packages/tasks/src/tasks/text-classification/inference.ts       | 2 +-
 packages/tasks/src/tasks/text-generation/inference.ts           | 2 +-
 packages/tasks/src/tasks/text-to-speech/inference.ts            | 2 +-
 packages/tasks/src/tasks/token-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/video-classification/inference.ts      | 2 +-
 packages/tasks/src/tasks/visual-question-answering/inference.ts | 2 +-
 packages/tasks/src/tasks/zero-shot-classification/inference.ts  | 2 +-
 .../tasks/src/tasks/zero-shot-image-classification/inference.ts | 2 +-
 .../tasks/src/tasks/zero-shot-object-detection/inference.ts     | 2 +-
 22 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 33764ad334..a1f068a48d 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 84540aa706..87a78c95b9 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 0feea79b11..19fa43c118 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index ac1f848651..dc5d92e051 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 8a043fc20a..b905bc3dc7 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 6a49d856d0..c603d73d5b 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 0d5d438d6f..6114ff9cc5 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 88548bc2b2..b9131b11ff 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6150a5eb8d..c1c3710330 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 66f5202486..917b8bf0ae 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 22edb2cced..ebcd5eeb10 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 829e1fd1bb..c5df0be044 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 8b7a3d2275..58adb8c06d 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 75a7032a23..dc924889c5 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 86725e74e6..4d86e0a999 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 23ad75189b..75c54a1661 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 3b7e5e0b3f..dffcdcf387 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index b6faf9d0ff..7b2de7049d 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index c2175d49cd..14dc539e9a 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 67cd325eae..0908a599b6 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 21e01d1793..46e7860c3f 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 815c99cfed..fe227f31f5 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,7 +1,7 @@
 /**
  * Inference code generated from the JSON schema spec in ./spec
  *
- * Generated on 2024-01-19T16:16:01.752Z
+ * Using src/scripts/inference-codegen
  */
 
 /**

From dbd0254d3929b272b023e02b9801fba5652b3e2f Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:01:28 +0100
Subject: [PATCH 13/51] =?UTF-8?q?=E2=9C=A8=20Add=20text2text-generation=20?=
 =?UTF-8?q?task=20&=20reference=20it=20from=20summarization/translation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/tasks/summarization/inference.ts      | 59 ++++++++++++++++++
 .../src/tasks/summarization/spec/input.json   |  6 ++
 .../src/tasks/summarization/spec/output.json  |  6 ++
 .../src/tasks/text2text-generation/about.md   | 15 +++++
 .../src/tasks/text2text-generation/data.ts    | 18 ++++++
 .../tasks/text2text-generation/inference.ts   | 55 +++++++++++++++++
 .../text2text-generation/spec/input.json      | 61 +++++++++++++++++++
 .../text2text-generation/spec/output.json     | 16 +++++
 .../tasks/src/tasks/translation/inference.ts  | 59 ++++++++++++++++++
 .../src/tasks/translation/spec/input.json     |  6 ++
 .../src/tasks/translation/spec/output.json    |  6 ++
 11 files changed, 307 insertions(+)
 create mode 100644 packages/tasks/src/tasks/summarization/inference.ts
 create mode 100644 packages/tasks/src/tasks/summarization/spec/input.json
 create mode 100644 packages/tasks/src/tasks/summarization/spec/output.json
 create mode 100644 packages/tasks/src/tasks/text2text-generation/about.md
 create mode 100644 packages/tasks/src/tasks/text2text-generation/data.ts
 create mode 100644 packages/tasks/src/tasks/text2text-generation/inference.ts
 create mode 100644 packages/tasks/src/tasks/text2text-generation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text2text-generation/spec/output.json
 create mode 100644 packages/tasks/src/tasks/translation/inference.ts
 create mode 100644 packages/tasks/src/tasks/translation/spec/input.json
 create mode 100644 packages/tasks/src/tasks/translation/spec/output.json

diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
new file mode 100644
index 0000000000..e4464c7489
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Summarization inference
+ *
+ * Inputs for Text2text Generation inference
+ */
+export interface SummarizationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs for Summarization inference
+ *
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface SummarizationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/summarization/spec/input.json b/packages/tasks/src/tasks/summarization/spec/input.json
new file mode 100644
index 0000000000..b7c09d1db8
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/spec/input.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/input.json",
+	"$id": "/inference/schemas/summarization/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Summarization inference"
+}
diff --git a/packages/tasks/src/tasks/summarization/spec/output.json b/packages/tasks/src/tasks/summarization/spec/output.json
new file mode 100644
index 0000000000..df7331ee64
--- /dev/null
+++ b/packages/tasks/src/tasks/summarization/spec/output.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/output.json",
+	"$id": "/inference/schemas/summarization/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Summarization inference"
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/about.md b/packages/tasks/src/tasks/text2text-generation/about.md
new file mode 100644
index 0000000000..fdb4558441
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/about.md
@@ -0,0 +1,15 @@
+## Use Cases
+
+You can contribute to this area by adding common use cases of the task!
+
+## Task Variants
+
+This section can be filled with variants of this task, if there are any.
+
+## Inference
+
+This section should have useful information about how to pull a model that is part of a task-specific library from the Hugging Face Hub and use it.
+
+## Useful Resources
+
+In this area, you can insert useful resources about how to train or use a model for this task.
diff --git a/packages/tasks/src/tasks/text2text-generation/data.ts b/packages/tasks/src/tasks/text2text-generation/data.ts
new file mode 100644
index 0000000000..7a7097e59c
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/data.ts
@@ -0,0 +1,18 @@
+import type { TaskDataCustom } from "..";
+
+const taskData: TaskDataCustom = {
+	datasets: [],
+	demo: {
+		inputs: [],
+		outputs: [],
+	},
+	isPlaceholder: false,
+	metrics: [],
+	models: [],
+	spaces: [],
+	summary: "",
+	widgetModels: [],
+	youtubeId: undefined,
+};
+
+export default taskData;
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
new file mode 100644
index 0000000000..5f144702fd
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -0,0 +1,55 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text2text Generation inference
+ */
+export interface Text2TextGenerationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface Text2TextGenerationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
new file mode 100644
index 0000000000..1b7077c655
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -0,0 +1,61 @@
+{
+	"$id": "/inference/schemas/text2text-generation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text2text Generation inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or more texts to use for text2text generation",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/Text2textGenerationParameters"
+		}
+	},
+	"$defs": {
+		"Text2textGenerationParameters": {
+			"description": "Additional inference parameters for Text2text Generation",
+			"type": "object",
+			"properties": {
+				"cleanUpTokenizationSpaces": {
+					"type": "boolean",
+					"description": "Whether to clean up the potential extra spaces in the text output."
+				},
+				"truncation": {
+					"type": "string",
+					"description": "The truncation strategy to use",
+					"oneOf": [
+						{
+							"const": "do_not_truncate"
+						},
+						{
+							"const": "longest_first"
+						},
+						{
+							"const": "only_first"
+						},
+						{
+							"const": "only_second"
+						}
+					]
+				},
+				"generateParameters": {
+					"type": "object",
+					"description": "Additional parametrization of the text generation algorithm"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
new file mode 100644
index 0000000000..5d6cf0cee7
--- /dev/null
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"$id": "/inference/schemas/text2text-generation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text2text Generation task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"generatedText": {
+				"type": "string",
+				"description": "The generated text."
+			}
+		},
+		"required": ["generatedText"]
+	}
+}
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
new file mode 100644
index 0000000000..db7f74739b
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -0,0 +1,59 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Translation inference
+ *
+ * Inputs for Text2text Generation inference
+ */
+export interface TranslationInput {
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Parameters;
+	[property: string]: any;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text2text Generation
+ */
+export interface Parameters {
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: any };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: any;
+}
+
+export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+/**
+ * Outputs for Translation inference
+ *
+ * Outputs of inference for the Text2text Generation task
+ */
+export interface TranslationOutput {
+	generatedTex: any;
+	/**
+	 * The generated text.
+	 */
+	generatedText?: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/translation/spec/input.json b/packages/tasks/src/tasks/translation/spec/input.json
new file mode 100644
index 0000000000..e3aac752cb
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/spec/input.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/input.json",
+	"$id": "/inference/schemas/translation/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Translation inference"
+}
diff --git a/packages/tasks/src/tasks/translation/spec/output.json b/packages/tasks/src/tasks/translation/spec/output.json
new file mode 100644
index 0000000000..6dcb98077b
--- /dev/null
+++ b/packages/tasks/src/tasks/translation/spec/output.json
@@ -0,0 +1,6 @@
+{
+	"$ref": "/inference/schemas/text2text-generation/output.json",
+	"$id": "/inference/schemas/translation/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for Translation inference"
+}

From 6d903489de637b1f83f31560fbb052683309a17c Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:31:54 +0100
Subject: [PATCH 14/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Use=20$id,=20$defs?=
 =?UTF-8?q?=20&=20title?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/tasks/audio-classification/spec/input.json |  7 ++++---
 .../tasks/audio-classification/spec/output.json    |  2 +-
 .../automatic-speech-recognition/spec/input.json   |  7 ++++---
 .../automatic-speech-recognition/spec/output.json  |  2 +-
 .../src/tasks/depth-estimation/spec/input.json     |  7 ++++---
 .../src/tasks/depth-estimation/spec/output.json    |  2 +-
 .../tasks/document-question-answering/inference.ts |  4 ++--
 .../document-question-answering/spec/input.json    | 11 ++++++-----
 .../document-question-answering/spec/output.json   |  2 +-
 .../src/tasks/feature-extraction/spec/input.json   |  7 ++++---
 .../src/tasks/feature-extraction/spec/output.json  |  9 +++++----
 packages/tasks/src/tasks/fill-mask/spec/input.json |  7 ++++---
 .../tasks/src/tasks/fill-mask/spec/output.json     |  2 +-
 .../src/tasks/image-classification/spec/input.json |  7 ++++---
 .../tasks/image-classification/spec/output.json    |  2 +-
 .../src/tasks/image-segmentation/spec/input.json   |  7 ++++---
 .../src/tasks/image-segmentation/spec/output.json  |  2 +-
 .../tasks/src/tasks/image-to-image/spec/input.json |  7 ++++---
 .../src/tasks/image-to-image/spec/output.json      |  2 +-
 .../tasks/src/tasks/image-to-text/spec/input.json  |  7 ++++---
 .../tasks/src/tasks/image-to-text/spec/output.json |  2 +-
 .../tasks/src/tasks/object-detection/inference.ts  |  4 ++--
 .../src/tasks/object-detection/spec/input.json     |  7 ++++---
 .../src/tasks/object-detection/spec/output.json    |  6 +++---
 .../src/tasks/question-answering/inference.ts      |  4 ++--
 .../src/tasks/question-answering/spec/input.json   | 14 ++++++++------
 .../src/tasks/question-answering/spec/output.json  |  3 ++-
 .../tasks/src/tasks/summarization/inference.ts     |  6 +++---
 .../tasks/table-question-answering/spec/input.json |  7 ++++---
 .../table-question-answering/spec/output.json      |  2 +-
 .../src/tasks/text-classification/spec/input.json  |  7 ++++---
 .../src/tasks/text-classification/spec/output.json |  2 +-
 .../src/tasks/text-generation/spec/input.json      |  7 ++++---
 .../src/tasks/text-generation/spec/output.json     |  2 +-
 .../tasks/src/tasks/text-to-speech/spec/input.json |  7 ++++---
 .../src/tasks/text-to-speech/spec/output.json      |  2 +-
 .../src/tasks/text2text-generation/inference.ts    |  6 +++---
 .../src/tasks/text2text-generation/spec/input.json |  4 +++-
 .../src/tasks/token-classification/spec/input.json |  7 ++++---
 .../tasks/token-classification/spec/output.json    |  2 +-
 packages/tasks/src/tasks/translation/inference.ts  |  6 +++---
 .../src/tasks/video-classification/spec/input.json |  7 ++++---
 .../tasks/video-classification/spec/output.json    |  2 +-
 .../tasks/visual-question-answering/inference.ts   |  4 ++--
 .../visual-question-answering/spec/input.json      | 14 ++++++++------
 .../visual-question-answering/spec/output.json     |  2 +-
 .../tasks/zero-shot-classification/inference.ts    |  4 ++--
 .../tasks/zero-shot-classification/spec/input.json | 14 ++++++++------
 .../zero-shot-classification/spec/output.json      |  2 +-
 .../zero-shot-image-classification/inference.ts    |  4 ++--
 .../zero-shot-image-classification/spec/input.json | 14 ++++++++------
 .../spec/output.json                               |  2 +-
 .../tasks/zero-shot-object-detection/inference.ts  |  8 ++++----
 .../zero-shot-object-detection/spec/input.json     | 14 ++++++++------
 .../zero-shot-object-detection/spec/output.json    |  6 +++---
 55 files changed, 169 insertions(+), 138 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index f2f3fbfbf8..29357710d4 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/input.json",
+	"$id": "/inference/schemas/audio-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/AudioClassificationParameters"
+			"$ref": "#/$defs/AudioClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"AudioClassificationParameters": {
+			"title": "AudioClassificationParameters",
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
index ddacf5872b..83e7abe71d 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/output.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/audio-classification/output.json",
+	"$id": "/inference/schemas/audio-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"title": "AudioClassificationOutput",
 	"description": "Outputs for Audio Classification inference",
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index dfd1c4bdb1..a4034b5e14 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/input.json",
+	"$id": "/inference/schemas/automatic-speech-recognition/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/AutomaticSpeechRecognitionParameters"
+			"$ref": "#/$defs/AutomaticSpeechRecognitionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"AutomaticSpeechRecognitionParameters": {
+			"title": "AutomaticSpeechRecognitionParameters",
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index e11153af65..a8b8af7822 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/automatic-speech-recognition/output.json",
+	"$id": "/inference/schemas/automatic-speech-recognition/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index 8483f13b50..f33df64448 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/depth-estimation/input.json",
+	"$id": "/inference/schemas/depth-estimation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/DepthEstimationParameters"
+			"$ref": "#/$defs/DepthEstimationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"DepthEstimationParameters": {
+			"title": "DepthEstimationParameters",
 			"description": "Additional inference parameters for Depth Estimation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
index 643aaaa7b1..c3ebebcc5d 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/output.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/depth-estimation/output.json",
+	"$id": "/inference/schemas/depth-estimation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Depth Estimation task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index dc5d92e051..5a8eeeb085 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface DocumentQuestionAnsweringInput {
 	/**
 	 * The
 	 */
-	inputs: DocumentAndQuestion[] | DocumentAndQuestion;
+	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface DocumentQuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface DocumentAndQuestion {
+export interface DocumentQuestionAnsweringInpu {
 	/**
 	 * The image on which the question is asked
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index dc72a24b2a..86d0708c58 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/document-question-answering/input.json",
+	"$id": "/inference/schemas/document-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
@@ -8,23 +8,24 @@
 			"description": "The ",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/DocumentAndQuestion"
+					"$ref": "#/$defs/DocumentAndQuestion"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/DocumentAndQuestion"
+						"$ref": "#/$defs/DocumentAndQuestion"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/DocumentQuestionAnsweringParameters"
+			"$ref": "#/$defs/DocumentQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"DocumentQuestionAnsweringParameters": {
+			"title": "DocumentQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Document Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
index 60f6b53147..4c77527757 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/document-question-answering/output.json",
+	"$id": "/inference/schemas/document-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Document Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index afa1ec9980..8bf05339a3 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/feature-extraction/input.json",
+	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Feature Extraction inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/FeatureExtractionParameters"
+			"$ref": "#/$defs/FeatureExtractionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"FeatureExtractionParameters": {
+			"title": "FeatureExtractionParameters",
 			"description": "Additional inference parameters for Feature Extraction",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index f2e0ce2bf7..4fac04cfee 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -1,14 +1,15 @@
 {
-	"id": "http://huggingface.co/inference/schemas/feature-extraction/output.json",
+	"$id": "/inference/schemas/feature-extraction/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Feature Extraction task",
 	"type": "array",
 	"items": {
 		"description": "The features computed by the mode, as a nested list of floats",
-		"$ref": "#/definitions/FeatureDimension"
+		"$ref": "#/$defs/FeatureTensor"
 	},
-	"definitions": {
-		"FeatureDimension": {
+	"$defs": {
+		"FeatureTensor": {
+			"title": "FeatureTensor",
 			"type": "array",
 			"items": {
 				"anyOf": [
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index b0588e21ad..6f7402efbe 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/fill-mask/input.json",
+	"$id": "/inference/schemas/fill-mask/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/FillMaskParameters"
+			"$ref": "#/$defs/FillMaskParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"FillMaskParameters": {
+			"title": "FillMaskParameters",
 			"description": "Additional inference parameters for Fill Mask",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
index 9ecf5aff88..3453d65d42 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/output.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/fill-mask/output.json",
+	"$id": "/inference/schemas/fill-mask/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Fill Mask task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index a9d09224b8..875fae0e0e 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-classification/input.json",
+	"$id": "/inference/schemas/image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageClassificationParameters"
+			"$ref": "#/$defs/ImageClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageClassificationParameters": {
+			"title": "ImageClassificationParameters",
 			"description": "Additional inference parameters for Image Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index f48dc3e770..da8a2a5c7a 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-classification/output.json",
+	"$id": "/inference/schemas/image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 4063d6619f..5e050b8c7a 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-segmentation/input.json",
+	"$id": "/inference/schemas/image-segmentation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageSegmentationParameters"
+			"$ref": "#/$defs/ImageSegmentationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageSegmentationParameters": {
+			"title": "ImageSegmentationParameters",
 			"description": "Additional inference parameters for Image Segmentation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 694abf4932..80db732e3e 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-segmentation/output.json",
+	"$id": "/inference/schemas/image-segmentation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Segmentation task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 2d2978c3a5..38b1202efa 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-image/input.json",
+	"$id": "/inference/schemas/image-to-image/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageToImageParameters"
+			"$ref": "#/$defs/ImageToImageParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageToImageParameters": {
+			"title": "ImageToImageParameters",
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index 0ec41e4507..d9c4f9bf21 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-image/output.json",
+	"$id": "/inference/schemas/image-to-image/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 4055218474..140f9e27e6 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-text/input.json",
+	"$id": "/inference/schemas/image-to-text/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ImageToTextParameters"
+			"$ref": "#/$defs/ImageToTextParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ImageToTextParameters": {
+			"title": "ImageToTextParameters",
 			"description": "Additional inference parameters for Image To Text",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
index 0c0392b50c..81960cd222 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/image-to-text/output.json",
+	"$id": "/inference/schemas/image-to-text/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Text task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index ebcd5eeb10..1a7785805a 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -40,7 +40,7 @@ export interface ObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: BoundingBox;
+	box: Box;
 	/**
 	 * The predicted label for the bounding box
 	 */
@@ -56,7 +56,7 @@ export interface ObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface BoundingBox {
+export interface Box {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 7698570f6a..f8647e78a9 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/object-detection/input.json",
+	"$id": "/inference/schemas/object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ObjectDetectionParameters"
+			"$ref": "#/$defs/ObjectDetectionParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"ObjectDetectionParameters": {
+			"title": "ObjectDetectionParameters",
 			"description": "Additional inference parameters for Object Detection",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index ef46a22655..41d0ed887d 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/object-detection/output.json",
+	"$id": "/inference/schemas/object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Object Detection task",
 	"type": "array",
@@ -15,13 +15,13 @@
 				"description": "The associated score / probability"
 			},
 			"box": {
-				"$ref": "#/definitions/BoundingBox",
+				"$ref": "#/$defs/BoundingBox",
 				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
 			}
 		},
 		"required": ["box", "label", "score"]
 	},
-	"definitions": {
+	"$defs": {
 		"BoundingBox": {
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index c5df0be044..493c4b7e51 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface QuestionAnsweringInput {
 	/**
 	 * One or several question+context pairs to answer
 	 */
-	inputs: QuestionAnsweringInputElement[] | QuestionAnsweringInputElement;
+	inputs: SquadExample[] | SquadExample;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface QuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface QuestionAnsweringInputElement {
+export interface SquadExample {
 	/**
 	 * The context to be used for answering the question
 	 */
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 9f1737f034..9eab32e13a 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/question-answering/input.json",
+	"$id": "/inference/schemas/question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
@@ -8,23 +8,24 @@
 			"description": "One or several question+context pairs to answer",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/QuestionAnsweringInput"
+					"$ref": "#/$defs/SquadExample"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/QuestionAnsweringInput"
+						"$ref": "#/$defs/SquadExample"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/QuestionAnsweringParameters"
+			"$ref": "#/$defs/QuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
-		"QuestionAnsweringInput": {
+	"$defs": {
+		"SquadExample": {
+			"title": "SquadExample",
 			"type": "object",
 			"properties": {
 				"question": {
@@ -39,6 +40,7 @@
 			"required": ["question", "context"]
 		},
 		"QuestionAnsweringParameters": {
+			"title": "QuestionAnsweringParameters",
 			"description": "Additional inference parameters for Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/question-answering/spec/output.json b/packages/tasks/src/tasks/question-answering/spec/output.json
index eea7e8e511..9da8f988ad 100644
--- a/packages/tasks/src/tasks/question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/question-answering/spec/output.json
@@ -1,6 +1,7 @@
 {
-	"id": "http://huggingface.co/inference/schemas/question-answering/output.json",
+	"$id": "/inference/schemas/question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "QuestionAnsweringOutput",
 	"description": "Outputs of inference for the Question Answering task",
 	"type": "array",
 	"items": {
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index e4464c7489..1b6579d16d 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -17,7 +17,7 @@ export interface SummarizationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -26,7 +26,7 @@ export interface SummarizationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -34,7 +34,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index 8d35bcf718..aa7c7231f8 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/table-question-answering/input.json",
+	"$id": "/inference/schemas/table-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
@@ -30,11 +30,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TableQuestionAnsweringParameters"
+			"$ref": "#/$defs/TableQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TableQuestionAnsweringParameters": {
+			"title": "TableQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Table Question Answering",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
index bb7969e3b2..8649006478 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/table-question-answering/output.json",
+	"$id": "/inference/schemas/table-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Table Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index b3b44d2ab3..af40fea2eb 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-classification/input.json",
+	"$id": "/inference/schemas/text-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextClassificationParameters"
+			"$ref": "#/$defs/TextClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextClassificationParameters": {
+			"title": "TextClassificationParameters",
 			"description": "Additional inference parameters for Text Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 1c317ed02e..4e6d69ed99 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-classification/output.json",
+	"$id": "/inference/schemas/text-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 08f0387022..9b5d3d08ef 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-generation/input.json",
+	"$id": "/inference/schemas/text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextGenerationParameters"
+			"$ref": "#/$defs/TextGenerationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextGenerationParameters": {
+			"title": "TextGenerationParameters",
 			"description": "Additional inference parameters for Text Generation",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index ccbeaea209..4f1eb95e55 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-generation/output.json",
+	"$id": "/inference/schemas/text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text Generation inference",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index c0e94850e5..96febb6fc1 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-to-audio/input.json",
+	"$id": "/inference/schemas/text-to-audio/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TextToAudioParameters"
+			"$ref": "#/$defs/TextToAudioParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TextToAudioParameters": {
+			"title": "TextToAudioParameters",
 			"description": "Additional inference parameters for Text To Audio",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index e8e92bbf8f..f91a9563ef 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/text-to-audio/output.json",
+	"$id": "/inference/schemas/text-to-audio/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text To Audio task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 5f144702fd..48dd088db8 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -15,7 +15,7 @@ export interface Text2TextGenerationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -24,7 +24,7 @@ export interface Text2TextGenerationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -32,7 +32,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index 1b7077c655..bec8fedfc2 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -25,6 +25,7 @@
 	},
 	"$defs": {
 		"Text2textGenerationParameters": {
+			"title": "Text2textGenerationParameters",
 			"description": "Additional inference parameters for Text2text Generation",
 			"type": "object",
 			"properties": {
@@ -50,7 +51,8 @@
 						}
 					]
 				},
-				"generateParameters": {
+				"Parameters": {
+					"title": "generateParameters",
 					"type": "object",
 					"description": "Additional parametrization of the text generation algorithm"
 				}
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index d70c71a81f..8ca4b07d33 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/token-classification/input.json",
+	"$id": "/inference/schemas/token-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
@@ -20,11 +20,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/TokenClassificationParameters"
+			"$ref": "#/$defs/TokenClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"TokenClassificationParameters": {
+			"title": "TokenClassificationParameters",
 			"description": "Additional inference parameters for Token Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
index 0e9a037684..7685b740b7 100644
--- a/packages/tasks/src/tasks/token-classification/spec/output.json
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/token-classification/output.json",
+	"$id": "/inference/schemas/token-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Token Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index db7f74739b..7f4d032a18 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -17,7 +17,7 @@ export interface TranslationInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: Parameters;
+	parameters?: Text2TextGenerationParameters;
 	[property: string]: any;
 }
 
@@ -26,7 +26,7 @@ export interface TranslationInput {
  *
  * Additional inference parameters for Text2text Generation
  */
-export interface Parameters {
+export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
@@ -34,7 +34,7 @@ export interface Parameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: any };
+	parameters?: { [key: string]: any };
 	/**
 	 * The truncation strategy to use
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 9f58d0bf34..91b9f9642e 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/video-classification/input.json",
+	"$id": "/inference/schemas/video-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
@@ -9,11 +9,12 @@
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/VideoClassificationParameters"
+			"$ref": "#/$defs/VideoClassificationParameters"
 		}
 	},
-	"definitions": {
+	"$defs": {
 		"VideoClassificationParameters": {
+			"title": "VideoClassificationParameters",
 			"description": "Additional inference parameters for Video Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index aa4e369f1f..7121e472fb 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/video-classification/output.json",
+	"$id": "/inference/schemas/video-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Video Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 14dc539e9a..05a57db484 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface VisualQuestionAnsweringInput {
 	/**
 	 * One or more image-question pairs
 	 */
-	inputs: VisualQuestionAnsweringInputElement[] | VisualQuestionAnsweringInputElement;
+	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface VisualQuestionAnsweringInput {
 	[property: string]: any;
 }
 
-export interface VisualQuestionAnsweringInputElement {
+export interface VisualQuestionAnsweringInputSingle {
 	/**
 	 * The image.
 	 */
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 134351b6b5..cc6e5d93ab 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/visual-question-answering/input.json",
+	"$id": "/inference/schemas/visual-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or more image-question pairs",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/VisualQuestionAnsweringInput"
+					"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/VisualQuestionAnsweringInput"
+						"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/VisualQuestionAnsweringParameters"
+			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
 		}
 	},
-	"definitions": {
-		"VisualQuestionAnsweringInput": {
+	"$defs": {
+		"VisualQuestionAnsweringInputSingle": {
 			"type": "object",
+			"title": "VisualQuestionAnsweringInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image."
@@ -37,6 +38,7 @@
 			"required": ["question", "image"]
 		},
 		"VisualQuestionAnsweringParameters": {
+			"title": "VisualQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Visual Question Answering",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
index 808957434b..2005d9f2f3 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/visual-question-answering/output.json",
+	"$id": "/inference/schemas/visual-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Visual Question Answering task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 0908a599b6..bc44970985 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotClassificationInput {
 	/**
 	 * One or several text + candidate labels pairs to classify
 	 */
-	inputs: ZeroShotClassificationInputElement[] | ZeroShotClassificationInputElement;
+	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotClassificationInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotClassificationInputElement {
+export interface ZeroShotClassificationInputSingle {
 	/**
 	 * The set of possible class labels to classify the text into.
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index 3682a9ddfb..e573f68179 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/input.json",
+	"$id": "/inference/schemas/zero-shot-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several text + candidate labels pairs to classify",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotClassificationInput"
+					"$ref": "#/$defs/ZeroShotClassificationInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotClassificationInput"
+						"$ref": "#/$defs/ZeroShotClassificationInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotClassificationParameters"
+			"$ref": "#/$defs/ZeroShotClassificationParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotClassificationInput": {
+	"$defs": {
+		"ZeroShotClassificationInputSingle": {
 			"type": "object",
+			"title": "ZeroShotClassificationInputSingle",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -42,6 +43,7 @@
 			"required": ["text", "candidateLabels"]
 		},
 		"ZeroShotClassificationParameters": {
+			"title": "ZeroShotClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 478e0bef20..54f226d9d8 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-classification/output.json",
+	"$id": "/inference/schemas/zero-shot-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 46e7860c3f..4ae4ff04e0 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotImageClassificationInput {
 	/**
 	 * One or several images to classify
 	 */
-	inputs: ZeroShotImageClassificationInputElement[] | ZeroShotImageClassificationInputElement;
+	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotImageClassificationInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotImageClassificationInputElement {
+export interface ZeroShotImageClassificationInputSingle {
 	/**
 	 * The candidate labels for this image
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 4e60af72ef..029b19b2dc 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/input.json",
+	"$id": "/inference/schemas/zero-shot-image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several images to classify",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotImageClassificationInput"
+					"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotImageClassificationInput"
+						"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotImageClassificationParameters"
+			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotImageClassificationInput": {
+	"$defs": {
+		"ZeroShotImageClassificationInputSingle": {
 			"type": "object",
+			"title": "ZeroShotImageClassificationInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image data to classify"
@@ -41,6 +42,7 @@
 			"required": ["image", "candidateLabels"]
 		},
 		"ZeroShotImageClassificationParameters": {
+			"title": "ZeroShotImageClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Image Classification",
 			"type": "object",
 			"properties": {
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index a400d66224..102944ebcc 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-image-classification/output.json",
+	"$id": "/inference/schemas/zero-shot-image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Image Classification task",
 	"type": "array",
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index fe227f31f5..64162ae7c5 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -11,7 +11,7 @@ export interface ZeroShotObjectDetectionInput {
 	/**
 	 * One or several images to perform object detection on
 	 */
-	inputs: ZeroShotObjectDetectionInputs[] | ZeroShotObjectDetectionInputs;
+	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface ZeroShotObjectDetectionInput {
 	[property: string]: any;
 }
 
-export interface ZeroShotObjectDetectionInputs {
+export interface ZeroShotObjectDetectionInputSingle {
 	/**
 	 * The candidate labels for this image
 	 */
@@ -39,7 +39,7 @@ export interface ZeroShotObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: BoundingBox;
+	box: Box;
 	/**
 	 * A candidate label
 	 */
@@ -55,7 +55,7 @@ export interface ZeroShotObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface BoundingBox {
+export interface Box {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 93e95f25f7..f2929226b1 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/input.json",
+	"$id": "/inference/schemas/zero-shot-object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
@@ -8,24 +8,25 @@
 			"description": "One or several images to perform object detection on",
 			"anyOf": [
 				{
-					"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+					"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/definitions/ZeroShotObjectDetectionInputs"
+						"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
 					}
 				}
 			]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
-			"$ref": "#/definitions/ZeroShotObjectDetectionParameters"
+			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
 		}
 	},
-	"definitions": {
-		"ZeroShotObjectDetectionInputs": {
+	"$defs": {
+		"ZeroShotObjectDetectionInputSingle": {
 			"type": "object",
+			"title": "ZeroShotObjectDetectionInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image data to generate bounding boxes from"
@@ -41,6 +42,7 @@
 			"required": ["image", "candidateLabels"]
 		},
 		"ZeroShotObjectDetectionParameters": {
+			"title": "ZeroShotObjectDetectionParameters",
 			"description": "Additional inference parameters for Zero Shot Object Detection",
 			"type": "object",
 			"properties": {}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index c5fd05eb3d..0e725af9e1 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -1,5 +1,5 @@
 {
-	"id": "http://huggingface.co/inference/schemas/zero-shot-object-detection/output.json",
+	"$id": "/inference/schemas/zero-shot-object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Object Detection task",
 	"type": "array",
@@ -15,13 +15,13 @@
 				"description": "The associated score / probability"
 			},
 			"box": {
-				"$ref": "#/definitions/BoundingBox",
+				"$ref": "#/$defs/BoundingBox",
 				"description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image."
 			}
 		},
 		"required": ["box", "label", "score"]
 	},
-	"definitions": {
+	"$defs": {
 		"BoundingBox": {
 			"type": "object",
 			"properties": {

From d027115cd2f64a324be5e90f787d25c2c7054a99 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 19 Jan 2024 18:46:58 +0100
Subject: [PATCH 15/51] =?UTF-8?q?=E2=9C=A8=20Add=20sentence=20similarity?=
 =?UTF-8?q?=20task=20spec?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/sentence-similarity/inference.ts    | 30 +++++++++++
 .../tasks/sentence-similarity/spec/input.json | 52 +++++++++++++++++++
 .../sentence-similarity/spec/output.json      | 11 ++++
 3 files changed, 93 insertions(+)
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/inference.ts
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/spec/input.json
 create mode 100644 packages/tasks/src/tasks/sentence-similarity/spec/output.json

diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
new file mode 100644
index 0000000000..43dcc4777d
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -0,0 +1,30 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Sentence similarity inference
+ */
+export interface SentenceSimilarityInput {
+	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+export interface SentenceSimilarityInputSingle {
+	/**
+	 * A list of strings which will be compared against sourceSentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
new file mode 100644
index 0000000000..cfb884abee
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -0,0 +1,52 @@
+{
+	"$id": "/inference/schemas/sentence-similarity/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Sentence similarity inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"anyOf": [
+				{
+					"$ref": "#/$defs/SentenceSimilarityInputSingle"
+				},
+				{
+					"type": "array",
+					"items": {
+						"$ref": "#/$defs/SentenceSimilarityInputSingle"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/SentenceSimilarityParameters"
+		}
+	},
+	"$defs": {
+		"SentenceSimilarityInputSingle": {
+			"title": "SentenceSimilarityInputSingle",
+			"type": "object",
+			"properties": {
+				"sourceSentence": {
+					"description": "The string that you wish to compare the other strings with. This can be a phrase, sentence, or longer passage, depending on the model being used.",
+					"type": "string"
+				},
+				"sentences": {
+					"type": "array",
+					"description": "A list of strings which will be compared against sourceSentence.",
+					"items": {
+						"type": "string"
+					}
+				}
+			},
+			"required": ["sourceSentence", "sentences"]
+		},
+		"SentenceSimilarityParameters": {
+			"title": "SentenceSimilarityParameters",
+			"description": "Additional inference parameters for Sentence Similarity",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/output.json b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
new file mode 100644
index 0000000000..e1fc1c9acb
--- /dev/null
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
@@ -0,0 +1,11 @@
+{
+	"$id": "/inference/schemas/sentence-similarity/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SentenceSimilarityOutput",
+	"description": "Outputs of inference for the Sentence Similarity task",
+	"type": "array",
+	"items": {
+		"description": "The associated similarity score for each of the given sentences",
+		"type": "number"
+	}
+}

From 224c039a6599e3a4ea674745eb9e386498176435 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:03:08 +0100
Subject: [PATCH 16/51] fix typo in text2text-generation spec

---
 .../tasks/src/tasks/text2text-generation/spec/output.json   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 5d6cf0cee7..12fe1f3dc8 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -11,6 +11,8 @@
 				"description": "The generated text."
 			}
 		},
-		"required": ["generatedTex"]
+		"required": [
+			"generatedText"
+		]
 	}
-}
+}
\ No newline at end of file

From b8dae864b9ca519ff479b248c2e6ee535966fbd2 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:04:22 +0100
Subject: [PATCH 17/51] regenerate code

---
 packages/tasks/src/tasks/summarization/inference.ts         | 3 +--
 packages/tasks/src/tasks/text2text-generation/inference.ts  | 3 +--
 .../tasks/src/tasks/text2text-generation/spec/output.json   | 6 ++----
 packages/tasks/src/tasks/translation/inference.ts           | 3 +--
 4 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 1b6579d16d..b2e6272c59 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -50,10 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 48dd088db8..4cd8709287 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -46,10 +46,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 12fe1f3dc8..190aa6014c 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -11,8 +11,6 @@
 				"description": "The generated text."
 			}
 		},
-		"required": [
-			"generatedText"
-		]
+		"required": ["generatedText"]
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 7f4d032a18..c4f31b0ea4 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -50,10 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	generatedTex: any;
 	/**
 	 * The generated text.
 	 */
-	generatedText?: string;
+	generatedText: string;
 	[property: string]: any;
 }

From b84825ee03aa7970409ec1dc901ed508e8cd6bd3 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:09:38 +0100
Subject: [PATCH 18/51] Have text-to-speech refer to text-to-audio

---
 .../src/tasks/text-to-audio/spec/input.json   | 37 +++++++++++++++++++
 .../src/tasks/text-to-audio/spec/output.json  | 22 +++++++++++
 .../src/tasks/text-to-speech/spec/input.json  | 37 ++-----------------
 .../src/tasks/text-to-speech/spec/output.json | 21 ++---------
 .../src/tasks/text2text-generation/about.md   | 15 --------
 .../src/tasks/text2text-generation/data.ts    | 18 ---------
 6 files changed, 67 insertions(+), 83 deletions(-)
 create mode 100644 packages/tasks/src/tasks/text-to-audio/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-audio/spec/output.json
 delete mode 100644 packages/tasks/src/tasks/text2text-generation/about.md
 delete mode 100644 packages/tasks/src/tasks/text2text-generation/data.ts

diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
new file mode 100644
index 0000000000..2ad196f5f5
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -0,0 +1,37 @@
+{
+    "$id": "/inference/schemas/text-to-audio/input.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Inputs for Text To Audio inference",
+    "type": "object",
+    "properties": {
+        "inputs": {
+            "description": "One or several texts to generate audio for",
+            "anyOf": [
+                {
+                    "type": "string"
+                },
+                {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            ]
+        },
+        "parameters": {
+            "description": "Additional inference parameters",
+            "$ref": "#/$defs/TextToAudioParameters"
+        }
+    },
+    "$defs": {
+        "TextToAudioParameters": {
+            "title": "TextToAudioParameters",
+            "description": "Additional inference parameters for Text To Audio",
+            "type": "object",
+            "properties": {}
+        }
+    },
+    "required": [
+        "inputs"
+    ]
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
new file mode 100644
index 0000000000..c66555117c
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -0,0 +1,22 @@
+{
+    "$id": "/inference/schemas/text-to-audio/output.json",
+    "$schema": "http://json-schema.org/draft-06/schema#",
+    "description": "Outputs of inference for the Text To Audio task",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "audio": {
+                "description": "The generated audio waveform."
+            },
+            "samplingRate": {
+                "type": "number",
+                "description": "The sampling rate of the generated audio waveform."
+            }
+        },
+        "required": [
+            "audio",
+            "samplingRate"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index 96febb6fc1..533c7d02df 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -1,35 +1,6 @@
 {
-	"$id": "/inference/schemas/text-to-audio/input.json",
+	"$ref": "/inference/schemas/text-to-audio/input.json",
+	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Inputs for Text To Audio inference",
-	"type": "object",
-	"properties": {
-		"inputs": {
-			"description": "One or several texts to generate audio for",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/TextToAudioParameters"
-		}
-	},
-	"$defs": {
-		"TextToAudioParameters": {
-			"title": "TextToAudioParameters",
-			"description": "Additional inference parameters for Text To Audio",
-			"type": "object",
-			"properties": {}
-		}
-	},
-	"required": ["inputs"]
-}
+	"description": "Inputs for Text to Speech inference"
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index f91a9563ef..1b591393f3 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -1,19 +1,6 @@
 {
-	"$id": "/inference/schemas/text-to-audio/output.json",
+	"$ref": "/inference/schemas/text-to-audio/output.json",
+	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Outputs of inference for the Text To Audio task",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"audio": {
-				"description": "The generated audio waveform."
-			},
-			"samplingRate": {
-				"type": "number",
-				"description": "The sampling rate of the generated audio waveform."
-			}
-		},
-		"required": ["audio", "samplingRate"]
-	}
-}
+	"description": "Outputs for Text to Speech inference"
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/text2text-generation/about.md b/packages/tasks/src/tasks/text2text-generation/about.md
deleted file mode 100644
index fdb4558441..0000000000
--- a/packages/tasks/src/tasks/text2text-generation/about.md
+++ /dev/null
@@ -1,15 +0,0 @@
-## Use Cases
-
-You can contribute this area with common use cases of the task!
-
-## Task Variants
-
-This place can be filled with variants of this task if there's any.
-
-## Inference
-
-This section should have useful information about how to pull a model from Hugging Face Hub that is a part of a library specialized in a task and use it.
-
-## Useful Resources
-
-In this area, you can insert useful resources about how to train or use a model for this task.
diff --git a/packages/tasks/src/tasks/text2text-generation/data.ts b/packages/tasks/src/tasks/text2text-generation/data.ts
deleted file mode 100644
index 7a7097e59c..0000000000
--- a/packages/tasks/src/tasks/text2text-generation/data.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import type { TaskDataCustom } from "..";
-
-const taskData: TaskDataCustom = {
-	datasets: [],
-	demo: {
-		inputs: [],
-		outputs: [],
-	},
-	isPlaceholder: false,
-	metrics: [],
-	models: [],
-	spaces: [],
-	summary: "",
-	widgetModels: [],
-	youtubeId: undefined,
-};
-
-export default taskData;

From 4484e394496c558d9f7a90803e9664c85c7eccaa Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Mon, 22 Jan 2024 16:10:39 +0100
Subject: [PATCH 19/51] regenerate code

---
 .../src/tasks/text-to-audio/inference.ts      | 35 ++++++++++
 .../src/tasks/text-to-audio/spec/input.json   | 70 +++++++++----------
 .../src/tasks/text-to-audio/spec/output.json  | 39 +++++------
 .../src/tasks/text-to-speech/inference.ts     |  4 ++
 .../src/tasks/text-to-speech/spec/input.json  |  2 +-
 .../src/tasks/text-to-speech/spec/output.json |  2 +-
 6 files changed, 93 insertions(+), 59 deletions(-)
 create mode 100644 packages/tasks/src/tasks/text-to-audio/inference.ts

diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
new file mode 100644
index 0000000000..3916184bd7
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -0,0 +1,35 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text To Audio inference
+ */
+export interface TextToAudioInput {
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: any };
+	[property: string]: any;
+}
+
+/**
+ * Outputs of inference for the Text To Audio task
+ */
+export interface TextToAudioOutput {
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: any;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: any;
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 2ad196f5f5..96febb6fc1 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -1,37 +1,35 @@
 {
-    "$id": "/inference/schemas/text-to-audio/input.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Inputs for Text To Audio inference",
-    "type": "object",
-    "properties": {
-        "inputs": {
-            "description": "One or several texts to generate audio for",
-            "anyOf": [
-                {
-                    "type": "string"
-                },
-                {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            ]
-        },
-        "parameters": {
-            "description": "Additional inference parameters",
-            "$ref": "#/$defs/TextToAudioParameters"
-        }
-    },
-    "$defs": {
-        "TextToAudioParameters": {
-            "title": "TextToAudioParameters",
-            "description": "Additional inference parameters for Text To Audio",
-            "type": "object",
-            "properties": {}
-        }
-    },
-    "required": [
-        "inputs"
-    ]
-}
\ No newline at end of file
+	"$id": "/inference/schemas/text-to-audio/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Audio inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "One or several texts to generate audio for",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/TextToAudioParameters"
+		}
+	},
+	"$defs": {
+		"TextToAudioParameters": {
+			"title": "TextToAudioParameters",
+			"description": "Additional inference parameters for Text To Audio",
+			"type": "object",
+			"properties": {}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
index c66555117c..f91a9563ef 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -1,22 +1,19 @@
 {
-    "$id": "/inference/schemas/text-to-audio/output.json",
-    "$schema": "http://json-schema.org/draft-06/schema#",
-    "description": "Outputs of inference for the Text To Audio task",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "audio": {
-                "description": "The generated audio waveform."
-            },
-            "samplingRate": {
-                "type": "number",
-                "description": "The sampling rate of the generated audio waveform."
-            }
-        },
-        "required": [
-            "audio",
-            "samplingRate"
-        ]
-    }
-}
\ No newline at end of file
+	"$id": "/inference/schemas/text-to-audio/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Audio task",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"audio": {
+				"description": "The generated audio waveform."
+			},
+			"samplingRate": {
+				"type": "number",
+				"description": "The sampling rate of the generated audio waveform."
+			}
+		},
+		"required": ["audio", "samplingRate"]
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 75c54a1661..137492b27c 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -5,6 +5,8 @@
  */
 
 /**
+ * Inputs for Text to Speech inference
+ *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
@@ -20,6 +22,8 @@ export interface TextToSpeechInput {
 }
 
 /**
+ * Outputs for Text to Speech inference
+ *
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index 533c7d02df..dffbf7910e 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -3,4 +3,4 @@
 	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text to Speech inference"
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index 1b591393f3..4678592e8a 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -3,4 +3,4 @@
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text to Speech inference"
-}
\ No newline at end of file
+}

From a9c9ae117395348776ae7644affcf9928f1c1228 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 10:57:51 +0100
Subject: [PATCH 20/51] Add quicktype-core from fork

---
 packages/tasks/package.json   |  4 +--
 packages/tasks/pnpm-lock.yaml | 54 ++++++++++++++++++-----------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 258679abaf..7077133d01 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -43,6 +43,6 @@
 	"license": "MIT",
 	"devDependencies": {
 		"@types/node": "^20.11.5",
-		"quicktype-core": "^23.0.81"
+		"quicktype-core": "https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz"
 	}
-}
+}
\ No newline at end of file
diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml
index fedbbb7c3b..0ff78350fc 100644
--- a/packages/tasks/pnpm-lock.yaml
+++ b/packages/tasks/pnpm-lock.yaml
@@ -9,8 +9,8 @@ devDependencies:
     specifier: ^20.11.5
     version: 20.11.5
   quicktype-core:
-    specifier: ^23.0.81
-    version: 23.0.81
+    specifier: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz
+    version: '@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz'
 
 packages:
 
@@ -80,8 +80,8 @@ packages:
     resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==}
     dev: true
 
-  /js-base64@3.7.5:
-    resolution: {integrity: sha512-3MEt5DTINKqfScXKfJFrRbxkrnk2AxPWGBL/ycjz4dK8iqiSJ06UxD8jh8xuh6p10TX4t2+7FsBYVxxQbMg+qA==}
+  /js-base64@3.7.6:
+    resolution: {integrity: sha512-NPrWuHFxFUknr1KqJRDgUQPexQF0uIJWjeT+2KjEePhitQxQEx5EJBG1lVn5/hc8aLycTpXrDOgPQ6Zq+EDiTA==}
     dev: true
 
   /lodash@4.17.21:
@@ -118,28 +118,6 @@ packages:
     engines: {node: '>= 0.6.0'}
     dev: true
 
-  /quicktype-core@23.0.81:
-    resolution: {integrity: sha512-iJQpCEzSQIkffJPS5NC+0w+Rq9faGgz09L+WIbseu1toFfj+M/3KTG5jhzdY/uN88fWosAom2fMoEADA403+rQ==}
-    dependencies:
-      '@glideapps/ts-necessities': 2.1.3
-      '@types/urijs': 1.19.25
-      browser-or-node: 2.1.1
-      collection-utils: 1.0.1
-      cross-fetch: 4.0.0
-      is-url: 1.2.4
-      js-base64: 3.7.5
-      lodash: 4.17.21
-      pako: 1.0.11
-      pluralize: 8.0.0
-      readable-stream: 4.4.2
-      unicode-properties: 1.4.1
-      urijs: 1.19.11
-      wordwrap: 1.0.0
-      yaml: 2.3.4
-    transitivePeerDependencies:
-      - encoding
-    dev: true
-
   /readable-stream@4.4.2:
     resolution: {integrity: sha512-Lk/fICSyIhodxy1IDK2HazkeGjSmezAWX2egdtJnYhtzKEsBPJowlI6F6LPb5tqIQILrMbx22S5o3GuJavPusA==}
     engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
@@ -210,3 +188,27 @@ packages:
     resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==}
     engines: {node: '>= 14'}
     dev: true
+
+  '@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz':
+    resolution: {tarball: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz}
+    name: quicktype-core
+    version: 18.0.15
+    dependencies:
+      '@glideapps/ts-necessities': 2.1.3
+      '@types/urijs': 1.19.25
+      browser-or-node: 2.1.1
+      collection-utils: 1.0.1
+      cross-fetch: 4.0.0
+      is-url: 1.2.4
+      js-base64: 3.7.6
+      lodash: 4.17.21
+      pako: 1.0.11
+      pluralize: 8.0.0
+      readable-stream: 4.4.2
+      unicode-properties: 1.4.1
+      urijs: 1.19.11
+      wordwrap: 1.0.0
+      yaml: 2.3.4
+    transitivePeerDependencies:
+      - encoding
+    dev: true

From f9fd4f934edd133ff5b6621304a45ccc14b2baba Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 10:58:13 +0100
Subject: [PATCH 21/51] regenerate code

---
 .../tasks/src/scripts/inference-codegen.ts    |   2 +
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../src/tasks/depth-estimation/inference.ts   |  34 ++--
 .../document-question-answering/inference.ts  | 150 +++++++++---------
 .../src/tasks/feature-extraction/inference.ts |  22 +--
 .../tasks/feature-extraction/spec/output.json |  86 +++++-----
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++-----
 .../tasks/image-classification/inference.ts   |  50 +++---
 .../src/tasks/image-segmentation/inference.ts |  74 ++++-----
 .../src/tasks/image-to-image/inference.ts     |  24 +--
 .../src/tasks/image-to-text/inference.ts      |  42 ++---
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../src/tasks/question-answering/inference.ts | 146 ++++++++---------
 .../tasks/sentence-similarity/inference.ts    |  38 +++--
 .../src/tasks/summarization/inference.ts      |  58 +++----
 .../table-question-answering/inference.ts     |  76 ++++-----
 .../tasks/text-classification/inference.ts    |  58 +++----
 .../src/tasks/text-generation/inference.ts    | 128 +++++++--------
 .../src/tasks/text-to-audio/inference.ts      |  40 ++---
 .../src/tasks/text-to-speech/inference.ts     |  40 ++---
 .../tasks/text2text-generation/inference.ts   |  58 +++----
 .../tasks/token-classification/inference.ts   |  92 +++++------
 .../tasks/src/tasks/translation/inference.ts  |  58 +++----
 .../tasks/video-classification/inference.ts   |  66 ++++----
 .../visual-question-answering/inference.ts    |  74 ++++-----
 .../zero-shot-classification/inference.ts     |  82 +++++-----
 .../inference.ts                              |  70 ++++----
 .../zero-shot-object-detection/inference.ts   |  78 ++++-----
 29 files changed, 966 insertions(+), 912 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index d256c81c58..aa92ba5a43 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -61,6 +61,8 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 			"nice-property-names": true,
 			"prefer-unions": true,
 			"prefer-const-values": true,
+			"prefer-unknown": true,
+			// "explicit-unions": true,
 		},
 	});
 }
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index a1f068a48d..d65ead71b9 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-	/**
-	 * On or several audio files to classify
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: AudioClassificationParameters;
-	[property: string]: any;
+    /**
+     * On or several audio files to classify
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 87a78c95b9..abaa0caef2 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,31 +1,33 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-	/**
-	 * The input audio data
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * The input audio data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The recognized text.
-	 */
-	text: string;
-	[property: string]: any;
+    /**
+     * The recognized text.
+     */
+    text: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 19fa43c118..ba5975a749 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,22 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type DepthEstimationOutput = unknown[];
+
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-	/**
-	 * The input image data
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DepthEstimationParameters;
-	[property: string]: any;
+    /**
+     * The input image data
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DepthEstimationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,9 +29,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 5a8eeeb085..6c730e277c 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-	/**
-	 * The
-	 */
-	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DocumentQuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * The
+     */
+    inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DocumentQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface DocumentQuestionAnsweringInpu {
-	/**
-	 * The image on which the question is asked
-	 */
-	image?: any;
-	/**
-	 * A question to ask of the document
-	 */
-	question?: string;
-	[property: string]: any;
+    /**
+     * The image on which the question is asked
+     */
+    image?: unknown;
+    /**
+     * A question to ask of the document
+     */
+    question?: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,65 +39,65 @@ export interface DocumentQuestionAnsweringInpu {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-	/**
-	 * If the words in the document are too long to fit with the question for the model, it will
-	 * be split in several chunks with some overlap. This argument controls the size of that
-	 * overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * Language to use while running OCR. Defaults to english.
-	 */
-	lang?: string;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using doc_stride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Can return less
-	 * than top_k answers if there are not enough options available within the context.
-	 */
-	topK?: number;
-	/**
-	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-	 * skip the OCR step and use the provided bounding boxes instead.
-	 */
-	wordBoxes?: Array<number[] | string>;
-	[property: string]: any;
+    /**
+     * If the words in the document are too long to fit with the question for the model, it will
+     * be split in several chunks with some overlap. This argument controls the size of that
+     * overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * Language to use while running OCR. Defaults to english.
+     */
+    lang?: string;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using doc_stride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Can return less
+     * than top_k answers if there are not enough options available within the context.
+     */
+    topK?: number;
+    /**
+     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+     * skip the OCR step and use the provided bounding boxes instead.
+     */
+    wordBoxes?: Array<number[] | string>;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
-	[property: string]: any;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    end:    number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    start: number;
+    /**
+     * The index of each word/box pair that is in the answer
+     */
+    words: number[];
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index b905bc3dc7..c6b6dcec50 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,20 +1,22 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Feature Extraction inference
  */
 export interface FeatureExtractionInput {
-	/**
-	 * One or several texts to get the features of
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to get the features of
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 4fac04cfee..47303e9451 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -3,57 +3,49 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Feature Extraction task",
 	"type": "array",
+	"title": "FeatureExtractionOutput",
 	"items": {
 		"description": "The features computed by the mode, as a nested list of floats",
-		"$ref": "#/$defs/FeatureTensor"
-	},
-	"$defs": {
-		"FeatureTensor": {
-			"title": "FeatureTensor",
-			"type": "array",
-			"items": {
-				"anyOf": [
-					{
-						"type": "number"
-					},
-					{
-						"type": "array",
-						"items": {
-							"anyOf": [
-								{
-									"type": "number"
-								},
-								{
-									"type": "array",
-									"items": {
-										"anyOf": [
-											{
-												"type": "number"
-											},
-											{
-												"type": "array",
-												"items": {
-													"anyOf": [
-														{
-															"type": "number"
-														},
-														{
-															"type": "array",
-															"items": {
-																"type": "number"
-															}
-														}
-													]
+		"anyOf": [
+			{
+				"type": "number"
+			},
+			{
+				"type": "array",
+				"items": {
+					"anyOf": [
+						{
+							"type": "number"
+						},
+						{
+							"type": "array",
+							"items": {
+								"anyOf": [
+									{
+										"type": "number"
+									},
+									{
+										"type": "array",
+										"items": {
+											"anyOf": [
+												{
+													"type": "number"
+												},
+												{
+													"type": "array",
+													"items": {
+														"type": "number"
+													}
 												}
-											}
-										]
+											]
+										}
 									}
-								}
-							]
+								]
+							}
 						}
-					}
-				]
+					]
+				}
 			}
-		}
+		]
 	}
-}
+}
\ No newline at end of file
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index c603d73d5b..380ae8cd28 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-	/**
-	 * One or several texts with masked tokens
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: FillMaskParameters;
-	[property: string]: any;
+    /**
+     * One or several texts with masked tokens
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: FillMaskParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,39 +27,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-	/**
-	 * When passed, the model will limit the scores to the passed targets instead of looking up
-	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-	 * tokenized and the first resulting token will be used (with a warning, and that might be
-	 * slower).
-	 */
-	targets?: string[] | string;
-	/**
-	 * When passed, overrides the number of predictions to return.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When passed, the model will limit the scores to the passed targets instead of looking up
+     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+     * tokenized and the first resulting token will be used (with a warning, and that might be
+     * slower).
+     */
+    targets?: string[] | string;
+    /**
+     * When passed, overrides the number of predictions to return.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-	/**
-	 * The corresponding probability
-	 */
-	score: number;
-	/**
-	 * The corresponding input with the mask token prediction.
-	 */
-	sequence: string;
-	/**
-	 * The predicted token id (to replace the masked one).
-	 */
-	token: number;
-	/**
-	 * The predicted token (to replace the masked one).
-	 */
-	tokenStr: string;
-	[property: string]: any;
+    /**
+     * The corresponding probability
+     */
+    score: number;
+    /**
+     * The corresponding input with the mask token prediction.
+     */
+    sequence: string;
+    /**
+     * The predicted token id (to replace the masked one).
+     */
+    token: number;
+    /**
+     * The predicted token (to replace the masked one).
+     */
+    tokenStr: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 6114ff9cc5..b6700e06e3 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-	/**
-	 * On or several image files to classify
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageClassificationParameters;
-	[property: string]: any;
+    /**
+     * On or several image files to classify
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index b9131b11ff..63fae52882 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-	/**
-	 * One or several image files to perform segmentation on
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageSegmentationParameters;
-	[property: string]: any;
+    /**
+     * One or several image files to perform segmentation on
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageSegmentationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,23 +27,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-	/**
-	 * Threshold to use when turning the predicted masks into binary values.
-	 */
-	maskThreshold?: number;
-	/**
-	 * Mask overlap threshold to eliminate small, disconnected segments.
-	 */
-	overlapMaskAreaThreshold?: number;
-	/**
-	 * Segmentation task to be performed, depending on model capabilities.
-	 */
-	subtask?: Subtask;
-	/**
-	 * Probability threshold to filter out predicted masks.
-	 */
-	threshold?: number;
-	[property: string]: any;
+    /**
+     * Threshold to use when turning the predicted masks into binary values.
+     */
+    maskThreshold?: number;
+    /**
+     * Mask overlap threshold to eliminate small, disconnected segments.
+     */
+    overlapMaskAreaThreshold?: number;
+    /**
+     * Segmentation task to be performed, depending on model capabilities.
+     */
+    subtask?: Subtask;
+    /**
+     * Probability threshold to filter out predicted masks.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 export type Subtask = "instance" | "panoptic" | "semantic";
@@ -52,13 +54,13 @@ export type Subtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-	/**
-	 * The label of the predicted segment
-	 */
-	label: string;
-	/**
-	 * The corresponding mask as a black-and-white image
-	 */
-	mask: any;
-	[property: string]: any;
+    /**
+     * The label of the predicted segment
+     */
+    label: string;
+    /**
+     * The corresponding mask as a black-and-white image
+     */
+    mask: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index c1c3710330..6fb0f997e7 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,20 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type ImageToImageOutput = unknown[];
+
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-	/**
-	 * One or more images to generate images from
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: any;
-	[property: string]: any;
+    /**
+     * One or more images to generate images from
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 917b8bf0ae..12e0d49689 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-	/**
-	 * One or several images to generated text for
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageToTextParameters;
-	[property: string]: any;
+    /**
+     * One or several images to generated text for
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageToTextParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,20 +27,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-	/**
-	 * The amount of maximum tokens to generate.
-	 */
-	maxNewTokens?: number;
-	[property: string]: any;
+    /**
+     * The amount of maximum tokens to generate.
+     */
+    maxNewTokens?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 1a7785805a..d294110f7c 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-	/**
-	 * One or several input images to perform object detection on
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ObjectDetectionParameters;
-	[property: string]: any;
+    /**
+     * One or several input images to perform object detection on
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ObjectDetectionParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,31 +27,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-	/**
-	 * The probability necessary to make a prediction.
-	 */
-	threshold?: number;
-	[property: string]: any;
+    /**
+     * The probability necessary to make a prediction.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: Box;
-	/**
-	 * The predicted label for the bounding box
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: Box;
+    /**
+     * The predicted label for the bounding box
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -57,9 +59,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface Box {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: any;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 493c4b7e51..75293984d8 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-	/**
-	 * One or several question+context pairs to answer
-	 */
-	inputs: SquadExample[] | SquadExample;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: QuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * One or several question+context pairs to answer
+     */
+    inputs: SquadExample[] | SquadExample;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: QuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface SquadExample {
-	/**
-	 * The context to be used for answering the question
-	 */
-	context: string;
-	/**
-	 * The question to be answered
-	 */
-	question: string;
-	[property: string]: any;
+    /**
+     * The context to be used for answering the question
+     */
+    context: string;
+    /**
+     * The question to be answered
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,63 +39,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-	/**
-	 * Attempts to align the answer to real words. Improves quality on space separated
-	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-	 */
-	alignToWords?: boolean;
-	/**
-	 * If the context is too long to fit with the question for the model, it will be split in
-	 * several chunks with some overlap. This argument controls the size of that overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer.
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using docStride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * Attempts to align the answer to real words. Improves quality on space separated
+     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+     */
+    alignToWords?: boolean;
+    /**
+     * If the context is too long to fit with the question for the model, it will be split in
+     * several chunks with some overlap. This argument controls the size of that overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer.
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using docStride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	/**
-	 * The character position in the input where the answer ends.
-	 */
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	/**
-	 * The character position in the input where the answer begins.
-	 */
-	start: number;
-	[property: string]: any;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    /**
+     * The character position in the input where the answer ends.
+     */
+    end: number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    /**
+     * The character position in the input where the answer begins.
+     */
+    start: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 43dcc4777d..40976f099a 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,30 +1,34 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
+export type SentenceSimilarityOutput = number[];
+
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 export interface SentenceSimilarityInputSingle {
-	/**
-	 * A list of strings which will be compared against the source_sentence.
-	 */
-	sentences: string[];
-	/**
-	 * The string that you wish to compare the other strings with. This can be a phrase,
-	 * sentence, or longer passage, depending on the model being used.
-	 */
-	sourceSentence: string;
-	[property: string]: any;
+    /**
+     * A list of strings which will be compared against the source_sentence.
+     */
+    sentences: string[];
+    /**
+     * The string that you wish to compare the other strings with. This can be a phrase,
+     * sentence, or longer passage, depending on the model being used.
+     */
+    sourceSentence: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index b2e6272c59..6169e942bd 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 58adb8c06d..22e6b88328 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,59 +1,61 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-	/**
-	 * One or several questions about a table
-	 */
-	inputs: Inputs;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several questions about a table
+     */
+    inputs: Inputs;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * One or several questions about a table
  */
 export interface Inputs {
-	/**
-	 * One or several questions to be answered about the table
-	 */
-	question?: string[] | string;
-	/**
-	 * The table to serve as context for the questions
-	 */
-	table?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several questions to be answered about the table
+     */
+    question?: string[] | string;
+    /**
+     * The table to serve as context for the questions
+     */
+    table?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-	/**
-	 * If the model has an aggregator, this returns the aggregator.
-	 */
-	aggregator?: string;
-	/**
-	 * The answer of the question given the table. If there is an aggregator, the answer will be
-	 * preceded by `AGGREGATOR >`.
-	 */
-	answer: string;
-	/**
-	 * List of strings made up of the answer cell values.
-	 */
-	cells: string[];
-	/**
-	 * Coordinates of the cells of the answers.
-	 */
-	coordinates: Array<number[]>;
-	[property: string]: any;
+    /**
+     * If the model has an aggregator, this returns the aggregator.
+     */
+    aggregator?: string;
+    /**
+     * The answer of the question given the table. If there is an aggregator, the answer will be
+     * preceded by `AGGREGATOR >`.
+     */
+    answer: string;
+    /**
+     * List of strings made up of the answer cell values.
+     */
+    cells: string[];
+    /**
+     * Coordinates of the cells of the answers.
+     */
+    coordinates: Array<number[]>;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index dc924889c5..c490ee94d2 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-	/**
-	 * One or several texts to classify
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several texts to classify
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: FunctionToApply;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: FunctionToApply;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type FunctionToApply = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type FunctionToApply = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 4d86e0a999..b28b1f2255 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-	/**
-	 * The text to initialize generation with
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * The text to initialize generation with
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,63 +27,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-	/**
-	 * Whether to use logit sampling (true) or greedy search (false).
-	 */
-	doSample?: boolean;
-	/**
-	 * Maximum number of generated tokens.
-	 */
-	maxNewTokens?: number;
-	/**
-	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-	 * paper](https://hf.co/papers/1909.05858) for more details.
-	 */
-	repetitionPenalty?: number;
-	/**
-	 * Whether to prepend the prompt to the generated text.
-	 */
-	returnFullText?: boolean;
-	/**
-	 * Stop generating tokens if a member of `stop_sequences` is generated.
-	 */
-	stopSequences?: string[];
-	/**
-	 * The value used to modulate the logits distribution.
-	 */
-	temperature?: number;
-	/**
-	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-	 */
-	topK?: number;
-	/**
-	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-	 * up to `top_p` or higher are kept for generation.
-	 */
-	topP?: number;
-	/**
-	 * Truncate input tokens to the given size.
-	 */
-	truncate?: number;
-	/**
-	 * Typical Decoding mass. See [Typical Decoding for Natural Language
-	 * Generation](https://hf.co/papers/2202.00666) for more information
-	 */
-	typicalP?: number;
-	/**
-	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-	 */
-	watermark?: boolean;
-	[property: string]: any;
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-	/**
-	 * The generated text
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 3916184bd7..f263ba8345 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,35 +1,37 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to generate audio for
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: any;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: any;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 137492b27c..ca08be0050 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several texts to generate audio for
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
@@ -27,13 +29,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: any;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: any;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 4cd8709287..38aaf3ab2b 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -46,9 +48,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index dffcdcf387..7b82c12c63 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-	/**
-	 * One or several texts which tokens are to be classified
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TokenClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several texts which tokens are to be classified
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TokenClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-	/**
-	 * The strategy used to fuse tokens based on model predictions
-	 */
-	aggregationStrategy?: AggregationStrategy;
-	/**
-	 * A list of labels to ignore
-	 */
-	ignoreLabels?: string[];
-	/**
-	 * The number of overlapping tokens between chunks when splitting the input text.
-	 */
-	stride?: number;
-	[property: string]: any;
+    /**
+     * The strategy used to fuse tokens based on model predictions
+     */
+    aggregationStrategy?: AggregationStrategy;
+    /**
+     * A list of labels to ignore
+     */
+    ignoreLabels?: string[];
+    /**
+     * The number of overlapping tokens between chunks when splitting the input text.
+     */
+    stride?: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -60,26 +62,26 @@ export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-	/**
-	 * The character position in the input where this group ends.
-	 */
-	end?: number;
-	/**
-	 * The predicted label for that group of tokens
-	 */
-	entityGroup?: string;
-	label: any;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	/**
-	 * The character position in the input where this group begins.
-	 */
-	start?: number;
-	/**
-	 * The corresponding text
-	 */
-	word?: string;
-	[property: string]: any;
+    /**
+     * The character position in the input where this group ends.
+     */
+    end?: number;
+    /**
+     * The predicted label for that group of tokens
+     */
+    entityGroup?: string;
+    label:        unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    /**
+     * The character position in the input where this group begins.
+     */
+    start?: number;
+    /**
+     * The corresponding text
+     */
+    word?: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index c4f31b0ea4..3b059542ac 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: any;
+    /**
+     * One or more texts to use for text2text generation
+     */
+    inputs: string[] | string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: any };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Truncation;
-	[property: string]: any;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    parameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Truncation;
+    [property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: any;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 7b2de7049d..d98b4300e9 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-	/**
-	 * One or several videos to be classified
-	 */
-	inputs: any;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VideoClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several videos to be classified
+     */
+    inputs: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VideoClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,32 +27,32 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-	/**
-	 * The sampling rate used to select frames from the video.
-	 */
-	frameSamplingRate?: number;
-	/**
-	 * The number of sampled frames to consider for classification.
-	 */
-	numFrames?: number;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The sampling rate used to select frames from the video.
+     */
+    frameSamplingRate?: number;
+    /**
+     * The number of sampled frames to consider for classification.
+     */
+    numFrames?: number;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 05a57db484..da28c988e9 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-	/**
-	 * One or more image-question pairs
-	 */
-	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VisualQuestionAnsweringParameters;
-	[property: string]: any;
+    /**
+     * One or more image-question pairs
+     */
+    inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VisualQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
 export interface VisualQuestionAnsweringInputSingle {
-	/**
-	 * The image.
-	 */
-	image: any;
-	/**
-	 * The question to answer based on the image.
-	 */
-	question: any;
-	[property: string]: any;
+    /**
+     * The image.
+     */
+    image: unknown;
+    /**
+     * The question to answer based on the image.
+     */
+    question: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,27 +39,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: any;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-	/**
-	 * The answer to the question
-	 */
-	answer?: string;
-	label: any;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The answer to the question
+     */
+    answer?: string;
+    label:   unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index bc44970985..59b7cc3cd9 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-	/**
-	 * One or several text + candidate labels pairs to classify
-	 */
-	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several text + candidate labels pairs to classify
+     */
+    inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotClassificationParameters;
+    [property: string]: unknown;
 }
 
 export interface ZeroShotClassificationInputSingle {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
-	[property: string]: any;
+    /**
+     * The set of possible class labels to classify the text into.
+     */
+    candidateLabels: string[];
+    /**
+     * The text to classify
+     */
+    text: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,31 +39,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	/**
-	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
-	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-	 * considered independent and probabilities are normalized for each candidate.
-	 */
-	multiLabel?: boolean;
-	[property: string]: any;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    /**
+     * Whether multiple candidate labels can be true. If false, the scores are normalized such
+     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+     * considered independent and probabilities are normalized for each candidate.
+     */
+    multiLabel?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 4ae4ff04e0..38aafb6a10 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,34 +1,36 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-	/**
-	 * One or several images to classify
-	 */
-	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotImageClassificationParameters;
-	[property: string]: any;
+    /**
+     * One or several images to classify
+     */
+    inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 export interface ZeroShotImageClassificationInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: any;
-	[property: string]: any;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to classify
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,25 +39,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	[property: string]: any;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 64162ae7c5..e9ef360bf4 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,54 +1,56 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-	/**
-	 * One or several images to perform object detection on
-	 */
-	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: any };
-	[property: string]: any;
+    /**
+     * One or several images to perform object detection on
+     */
+    inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 export interface ZeroShotObjectDetectionInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: any;
-	[property: string]: any;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to generate bounding boxes from
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: Box;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: any;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: Box;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -56,9 +58,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface Box {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: any;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }

From d4ec5350b0d56f0153aca155a53617db6f23f8ff Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:00:11 +0100
Subject: [PATCH 22/51] =?UTF-8?q?=F0=9F=92=84format=20with=20pnpm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/package.json                   |   2 +-
 .../tasks/audio-classification/inference.ts   |  50 +++---
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../document-question-answering/inference.ts  | 150 +++++++++---------
 .../src/tasks/feature-extraction/inference.ts |  22 ++-
 .../tasks/feature-extraction/spec/output.json |   2 +-
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++-----
 .../tasks/image-classification/inference.ts   |  50 +++---
 .../src/tasks/image-segmentation/inference.ts |  74 +++++----
 .../src/tasks/image-to-image/inference.ts     |  22 ++-
 .../src/tasks/image-to-text/inference.ts      |  42 +++--
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../src/tasks/question-answering/inference.ts | 146 +++++++++--------
 .../tasks/sentence-similarity/inference.ts    |  36 ++---
 .../src/tasks/summarization/inference.ts      |  58 ++++---
 .../table-question-answering/inference.ts     |  76 +++++----
 .../tasks/text-classification/inference.ts    |  58 ++++---
 .../src/tasks/text-generation/inference.ts    | 128 ++++++++-------
 .../src/tasks/text-to-audio/inference.ts      |  40 +++--
 .../src/tasks/text-to-speech/inference.ts     |  40 +++--
 .../tasks/text2text-generation/inference.ts   |  58 ++++---
 .../tasks/token-classification/inference.ts   |  92 ++++++-----
 .../tasks/src/tasks/translation/inference.ts  |  58 ++++---
 .../tasks/video-classification/inference.ts   |  66 ++++----
 .../visual-question-answering/inference.ts    |  74 +++++----
 .../zero-shot-classification/inference.ts     |  82 +++++-----
 .../inference.ts                              |  70 ++++----
 .../zero-shot-object-detection/inference.ts   |  78 +++++----
 29 files changed, 867 insertions(+), 921 deletions(-)

diff --git a/packages/tasks/package.json b/packages/tasks/package.json
index 7077133d01..e61a09163d 100644
--- a/packages/tasks/package.json
+++ b/packages/tasks/package.json
@@ -45,4 +45,4 @@
 		"@types/node": "^20.11.5",
 		"quicktype-core": "https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz"
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index d65ead71b9..2ac2e50656 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * On or several audio files to classify
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * On or several audio files to classify
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index abaa0caef2..6eb20d0c16 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,33 +1,31 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-    /**
-     * The input audio data
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-    /**
-     * The recognized text.
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index ba5975a749..48b9d3438e 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-    /**
-     * The input image data
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DepthEstimationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,9 +27,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 6c730e277c..8dec0976f2 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-    /**
-     * The
-     */
-    inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DocumentQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * The
+	 */
+	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface DocumentQuestionAnsweringInpu {
-    /**
-     * The image on which the question is asked
-     */
-    image?: unknown;
-    /**
-     * A question to ask of the document
-     */
-    question?: string;
-    [property: string]: unknown;
+	/**
+	 * The image on which the question is asked
+	 */
+	image?: unknown;
+	/**
+	 * A question to ask of the document
+	 */
+	question?: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,65 +37,65 @@ export interface DocumentQuestionAnsweringInpu {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-    /**
-     * If the words in the document are too long to fit with the question for the model, it will
-     * be split in several chunks with some overlap. This argument controls the size of that
-     * overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * Language to use while running OCR. Defaults to english.
-     */
-    lang?: string;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using doc_stride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Can return less
-     * than top_k answers if there are not enough options available within the context.
-     */
-    topK?: number;
-    /**
-     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-     * skip the OCR step and use the provided bounding boxes instead.
-     */
-    wordBoxes?: Array<number[] | string>;
-    [property: string]: unknown;
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    end:    number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    start: number;
-    /**
-     * The index of each word/box pair that is in the answer
-     */
-    words: number[];
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index c6b6dcec50..5c237d6dd6 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,22 +1,20 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Feature Extraction inference
  */
 export interface FeatureExtractionInput {
-    /**
-     * One or several texts to get the features of
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to get the features of
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 47303e9451..54a29d10e2 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -48,4 +48,4 @@
 			}
 		]
 	}
-}
\ No newline at end of file
+}
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 380ae8cd28..097718900f 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-    /**
-     * One or several texts with masked tokens
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: FillMaskParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts with masked tokens
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,39 +25,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-    /**
-     * When passed, the model will limit the scores to the passed targets instead of looking up
-     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-     * tokenized and the first resulting token will be used (with a warning, and that might be
-     * slower).
-     */
-    targets?: string[] | string;
-    /**
-     * When passed, overrides the number of predictions to return.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[] | string;
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-    /**
-     * The corresponding probability
-     */
-    score: number;
-    /**
-     * The corresponding input with the mask token prediction.
-     */
-    sequence: string;
-    /**
-     * The predicted token id (to replace the masked one).
-     */
-    token: number;
-    /**
-     * The predicted token (to replace the masked one).
-     */
-    tokenStr: string;
-    [property: string]: unknown;
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index b6700e06e3..dfff0cfd9b 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-    /**
-     * On or several image files to classify
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * On or several image files to classify
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,24 +25,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 63fae52882..13c15cb72f 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-    /**
-     * One or several image files to perform segmentation on
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageSegmentationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several image files to perform segmentation on
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-    /**
-     * Threshold to use when turning the predicted masks into binary values.
-     */
-    maskThreshold?: number;
-    /**
-     * Mask overlap threshold to eliminate small, disconnected segments.
-     */
-    overlapMaskAreaThreshold?: number;
-    /**
-     * Segmentation task to be performed, depending on model capabilities.
-     */
-    subtask?: Subtask;
-    /**
-     * Probability threshold to filter out predicted masks.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: Subtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 export type Subtask = "instance" | "panoptic" | "semantic";
@@ -54,13 +52,13 @@ export type Subtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-    /**
-     * The label of the predicted segment
-     */
-    label: string;
-    /**
-     * The corresponding mask as a black-and-white image
-     */
-    mask: unknown;
-    [property: string]: unknown;
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6fb0f997e7..c1f1a5cb8b 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-    /**
-     * One or more images to generate images from
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: unknown;
-    [property: string]: unknown;
+	/**
+	 * One or more images to generate images from
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 12e0d49689..029db76daa 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-    /**
-     * One or several images to generated text for
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageToTextParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several images to generate text for
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,20 +25,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-    /**
-     * The amount of maximum tokens to generate.
-     */
-    maxNewTokens?: number;
-    [property: string]: unknown;
+	/**
+	 * The maximum number of tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index d294110f7c..228063fc05 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-    /**
-     * One or several input images to perform object detection on
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ObjectDetectionParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several input images to perform object detection on
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,31 +25,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-    /**
-     * The probability necessary to make a prediction.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: Box;
-    /**
-     * The predicted label for the bounding box
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: Box;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -59,9 +57,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface Box {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 75293984d8..58da43f6d1 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-    /**
-     * One or several question+context pairs to answer
-     */
-    inputs: SquadExample[] | SquadExample;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: QuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several question+context pairs to answer
+	 */
+	inputs: SquadExample[] | SquadExample;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface SquadExample {
-    /**
-     * The context to be used for answering the question
-     */
-    context: string;
-    /**
-     * The question to be answered
-     */
-    question: string;
-    [property: string]: unknown;
+	/**
+	 * The context to be used for answering the question
+	 */
+	context: string;
+	/**
+	 * The question to be answered
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,63 +37,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-    /**
-     * Attempts to align the answer to real words. Improves quality on space separated
-     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-     */
-    alignToWords?: boolean;
-    /**
-     * If the context is too long to fit with the question for the model, it will be split in
-     * several chunks with some overlap. This argument controls the size of that overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer.
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using docStride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return fewer than topK answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    /**
-     * The character position in the input where the answer ends.
-     */
-    end: number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    /**
-     * The character position in the input where the answer begins.
-     */
-    start: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated with the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 40976f099a..252326caf1 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,34 +1,32 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-    inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 export interface SentenceSimilarityInputSingle {
-    /**
-     * A list of strings which will be compared against the source_sentence.
-     */
-    sentences: string[];
-    /**
-     * The string that you wish to compare the other strings with. This can be a phrase,
-     * sentence, or longer passage, depending on the model being used.
-     */
-    sourceSentence: string;
-    [property: string]: unknown;
+	/**
+	 * A list of strings which will be compared against the source_sentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 6169e942bd..4a2fd40a55 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 22e6b88328..35b172a5da 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,61 +1,59 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-    /**
-     * One or several questions about a table
-     */
-    inputs: Inputs;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several questions about a table
+	 */
+	inputs: Inputs;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * One or several questions about a table
  */
 export interface Inputs {
-    /**
-     * One or several questions to be answered about the table
-     */
-    question?: string[] | string;
-    /**
-     * The table to serve as context for the questions
-     */
-    table?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several questions to be answered about the table
+	 */
+	question?: string[] | string;
+	/**
+	 * The table to serve as context for the questions
+	 */
+	table?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-    /**
-     * If the model has an aggregator, this returns the aggregator.
-     */
-    aggregator?: string;
-    /**
-     * The answer of the question given the table. If there is an aggregator, the answer will be
-     * preceded by `AGGREGATOR >`.
-     */
-    answer: string;
-    /**
-     * List of strings made up of the answer cell values.
-     */
-    cells: string[];
-    /**
-     * Coordinates of the cells of the answers.
-     */
-    coordinates: Array<number[]>;
-    [property: string]: unknown;
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer to the question given the table. If there is an aggregator, the answer will be
+	 * preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers.
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index c490ee94d2..33648fdd88 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-    /**
-     * One or several texts to classify
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts to classify
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: FunctionToApply;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: FunctionToApply;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type FunctionToApply = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type FunctionToApply = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index b28b1f2255..62af0a9c59 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to initialize generation with
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: unknown;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stop_sequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `top_p` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index f263ba8345..3b23948a36 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,37 +1,35 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-    /**
-     * One or several texts to generate audio for
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index ca08be0050..766b23a382 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-    /**
-     * One or several texts to generate audio for
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several texts to generate audio for
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
@@ -29,13 +27,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 38aaf3ab2b..5241648d36 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -48,9 +46,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 7b82c12c63..b6ec0e9b2a 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-    /**
-     * One or several texts which tokens are to be classified
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TokenClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several texts whose tokens are to be classified
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-    /**
-     * The strategy used to fuse tokens based on model predictions
-     */
-    aggregationStrategy?: AggregationStrategy;
-    /**
-     * A list of labels to ignore
-     */
-    ignoreLabels?: string[];
-    /**
-     * The number of overlapping tokens between chunks when splitting the input text.
-     */
-    stride?: number;
-    [property: string]: unknown;
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: AggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -62,26 +60,26 @@ export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-    /**
-     * The character position in the input where this group ends.
-     */
-    end?: number;
-    /**
-     * The predicted label for that group of tokens
-     */
-    entityGroup?: string;
-    label:        unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    /**
-     * The character position in the input where this group begins.
-     */
-    start?: number;
-    /**
-     * The corresponding text
-     */
-    word?: string;
-    [property: string]: unknown;
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The corresponding text
+	 */
+	word?: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 3b059542ac..26786d8e5e 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-    /**
-     * One or more texts to use for text2text generation
-     */
-    inputs: string[] | string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more texts to use for text2text generation
+	 */
+	inputs: string[] | string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    parameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Truncation;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	parameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Truncation;
+	[property: string]: unknown;
 }
 
 export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "o
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index d98b4300e9..29b3fed095 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-    /**
-     * One or several videos to be classified
-     */
-    inputs: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VideoClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several videos to be classified
+	 */
+	inputs: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,32 +25,32 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-    /**
-     * The sampling rate used to select frames from the video.
-     */
-    frameSamplingRate?: number;
-    /**
-     * The number of sampled frames to consider for classification.
-     */
-    numFrames?: number;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index da28c988e9..3b9070447a 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-    /**
-     * One or more image-question pairs
-     */
-    inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VisualQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One or more image-question pairs
+	 */
+	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 export interface VisualQuestionAnsweringInputSingle {
-    /**
-     * The image.
-     */
-    image: unknown;
-    /**
-     * The question to answer based on the image.
-     */
-    question: unknown;
-    [property: string]: unknown;
+	/**
+	 * The image.
+	 */
+	image: unknown;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,27 +37,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return less than topk answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-    /**
-     * The answer to the question
-     */
-    answer?: string;
-    label:   unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 59b7cc3cd9..564c6ba629 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-    /**
-     * One or several text + candidate labels pairs to classify
-     */
-    inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several text + candidate labels pairs to classify
+	 */
+	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: unknown;
 }
 
 export interface ZeroShotClassificationInputSingle {
-    /**
-     * The set of possible class labels to classify the text into.
-     */
-    candidateLabels: string[];
-    /**
-     * The text to classify
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,31 +37,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    /**
-     * Whether multiple candidate labels can be true. If false, the scores are normalized such
-     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-     * considered independent and probabilities are normalized for each candidate.
-     */
-    multiLabel?: boolean;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 38aafb6a10..0976094a45 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,36 +1,34 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-    /**
-     * One or several images to classify
-     */
-    inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * One or several images to classify
+	 */
+	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 export interface ZeroShotImageClassificationInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to classify
-     */
-    image: unknown;
-    [property: string]: unknown;
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -39,25 +37,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index e9ef360bf4..de136d6c3a 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,56 +1,54 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-    /**
-     * One or several images to perform object detection on
-     */
-    inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One or several images to perform object detection on
+	 */
+	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 export interface ZeroShotObjectDetectionInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to generate bounding boxes from
-     */
-    image: unknown;
-    [property: string]: unknown;
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: Box;
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: Box;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -58,9 +56,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface Box {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }

From 00501a603e32131b3cb4fbec20325f2c656ea1b6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:09:08 +0100
Subject: [PATCH 23/51] Add canonicalId to TaskData

---
 packages/tasks/src/tasks/index.ts               | 1 +
 packages/tasks/src/tasks/summarization/data.ts  | 1 +
 packages/tasks/src/tasks/text-to-speech/data.ts | 1 +
 packages/tasks/src/tasks/translation/data.ts    | 1 +
 4 files changed, 4 insertions(+)

diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts
index b0615dfeb3..9e94253763 100644
--- a/packages/tasks/src/tasks/index.ts
+++ b/packages/tasks/src/tasks/index.ts
@@ -216,6 +216,7 @@ export interface TaskData {
 	datasets: ExampleRepo[];
 	demo: TaskDemo;
 	id: PipelineType;
+	canonicalId?: PipelineType;
 	isPlaceholder?: boolean;
 	label: string;
 	libraries: ModelLibraryKey[];
diff --git a/packages/tasks/src/tasks/summarization/data.ts b/packages/tasks/src/tasks/summarization/data.ts
index b13fa3d163..bd04453da3 100644
--- a/packages/tasks/src/tasks/summarization/data.ts
+++ b/packages/tasks/src/tasks/summarization/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text2text-generation",
 	datasets: [
 		{
 			description:
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
index 73560b7afc..26c6f48371 100644
--- a/packages/tasks/src/tasks/text-to-speech/data.ts
+++ b/packages/tasks/src/tasks/text-to-speech/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text-to-audio",
 	datasets: [
 		{
 			description: "Thousands of short audio clips of a single speaker.",
diff --git a/packages/tasks/src/tasks/translation/data.ts b/packages/tasks/src/tasks/translation/data.ts
index c0e4c3a340..0edfab7b88 100644
--- a/packages/tasks/src/tasks/translation/data.ts
+++ b/packages/tasks/src/tasks/translation/data.ts
@@ -1,6 +1,7 @@
 import type { TaskDataCustom } from "..";
 
 const taskData: TaskDataCustom = {
+	canonicalId: "text2text-generation",
 	datasets: [
 		{
 			description: "A dataset of copyright-free books translated into 16 different languages.",

From 29fecc059b00ef9ceb5bb50c6f638a8b3373515b Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:13:32 +0100
Subject: [PATCH 24/51] Fix naming for bounding boxes types

---
 packages/tasks/src/tasks/object-detection/inference.ts        | 4 ++--
 packages/tasks/src/tasks/object-detection/spec/output.json    | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/inference.ts   | 4 ++--
 .../src/tasks/zero-shot-object-detection/spec/output.json     | 1 +
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 228063fc05..5675eb53a2 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -40,7 +40,7 @@ export interface ObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: Box;
+	box: BoundingBox;
 	/**
 	 * The predicted label for the bounding box
 	 */
@@ -56,7 +56,7 @@ export interface ObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface Box {
+export interface BoundingBox {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index 41d0ed887d..450d96ed28 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -24,6 +24,7 @@
 	"$defs": {
 		"BoundingBox": {
 			"type": "object",
+			"title": "BoundingBox",
 			"properties": {
 				"xmin": {
 					"type": "integer"
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index de136d6c3a..6493541d82 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -39,7 +39,7 @@ export interface ZeroShotObjectDetectionOutput {
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
 	 */
-	box: Box;
+	box: BoundingBox;
 	/**
 	 * A candidate label
 	 */
@@ -55,7 +55,7 @@ export interface ZeroShotObjectDetectionOutput {
  * The predicted bounding box. Coordinates are relative to the top left corner of the input
  * image.
  */
-export interface Box {
+export interface BoundingBox {
 	xmax: number;
 	xmin: number;
 	ymax: number;
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 0e725af9e1..171e81120f 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -23,6 +23,7 @@
 	},
 	"$defs": {
 		"BoundingBox": {
+			"title": "BoundingBox",
 			"type": "object",
 			"properties": {
 				"xmin": {

From d220a9b75938d459a2d385b1fb33f11620a1b4eb Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:19:13 +0100
Subject: [PATCH 25/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Better=20names=20for?=
 =?UTF-8?q?=20intermediate=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/tasks/image-segmentation/inference.ts      | 4 ++--
 packages/tasks/src/tasks/image-segmentation/spec/input.json   | 1 +
 packages/tasks/src/tasks/summarization/inference.ts           | 4 ++--
 packages/tasks/src/tasks/text-classification/inference.ts     | 4 ++--
 packages/tasks/src/tasks/text-classification/spec/input.json  | 1 +
 packages/tasks/src/tasks/text2text-generation/inference.ts    | 4 ++--
 packages/tasks/src/tasks/text2text-generation/spec/input.json | 1 +
 packages/tasks/src/tasks/token-classification/inference.ts    | 4 ++--
 packages/tasks/src/tasks/token-classification/spec/input.json | 1 +
 packages/tasks/src/tasks/translation/inference.ts             | 4 ++--
 10 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 13c15cb72f..8b5e6da56b 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -36,7 +36,7 @@ export interface ImageSegmentationParameters {
 	/**
 	 * Segmentation task to be performed, depending on model capabilities.
 	 */
-	subtask?: Subtask;
+	subtask?: ImageSegmentationSubtask;
 	/**
 	 * Probability threshold to filter out predicted masks.
 	 */
@@ -44,7 +44,7 @@ export interface ImageSegmentationParameters {
 	[property: string]: unknown;
 }
 
-export type Subtask = "instance" | "panoptic" | "semantic";
+export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
 
 /**
  * Outputs of inference for the Image Segmentation task
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 5e050b8c7a..06a80028b7 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -27,6 +27,7 @@
 					"description": "Mask overlap threshold to eliminate small, disconnected segments."
 				},
 				"subtask": {
+					"title": "ImageSegmentationSubtask",
 					"type": "string",
 					"description": "Segmentation task to be performed, depending on model capabilities.",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 4a2fd40a55..d38632cd4f 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -38,11 +38,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs for Summarization inference
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 33648fdd88..58a54af0c4 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -28,7 +28,7 @@ export interface TextClassificationParameters {
 	/**
 	 * The function to apply to the model outputs in order to retrieve the scores.
 	 */
-	functionToApply?: FunctionToApply;
+	functionToApply?: TextClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
@@ -36,7 +36,7 @@ export interface TextClassificationParameters {
 	[property: string]: unknown;
 }
 
-export type FunctionToApply = "sigmoid" | "softmax" | "none";
+export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 
 /**
  * Outputs of inference for the Text Classification task
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index af40fea2eb..73b14c794c 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -30,6 +30,7 @@
 			"type": "object",
 			"properties": {
 				"functionToApply": {
+					"title": "TextClassificationOutputTransform",
 					"type": "string",
 					"description": "The function to apply to the model outputs in order to retrieve the scores.",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 5241648d36..7d2c7182e3 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -36,11 +36,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs of inference for the Text2text Generation task
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index bec8fedfc2..e8a0b9cd08 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -34,6 +34,7 @@
 					"description": "Whether to clean up the potential extra spaces in the text output."
 				},
 				"truncation": {
+					"title": "Text2textGenerationTruncationStrategy",
 					"type": "string",
 					"description": "The truncation strategy to use",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index b6ec0e9b2a..fa18ba34b8 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -28,7 +28,7 @@ export interface TokenClassificationParameters {
 	/**
 	 * The strategy used to fuse tokens based on model predictions
 	 */
-	aggregationStrategy?: AggregationStrategy;
+	aggregationStrategy?: TokenClassificationAggregationStrategy;
 	/**
 	 * A list of labels to ignore
 	 */
@@ -54,7 +54,7 @@ export interface TokenClassificationParameters {
  * Similar to "simple", also preserves word integrity (uses the label with the highest score
  * across the word's tokens).
  */
-export type AggregationStrategy = "none" | "simple" | "first" | "average" | "max";
+export type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
 
 /**
  * Outputs of inference for the Token Classification task
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index 8ca4b07d33..a2fcf5fdf2 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -41,6 +41,7 @@
 					"description": "The number of overlapping tokens between chunks when splitting the input text."
 				},
 				"aggregationStrategy": {
+					"title": "TokenClassificationAggregationStrategy",
 					"type": "string",
 					"description": "The strategy used to fuse tokens based on model predictions",
 					"oneOf": [
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 26786d8e5e..ecb1082879 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -38,11 +38,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * The truncation strategy to use
 	 */
-	truncation?: Truncation;
+	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
 
-export type Truncation = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
 
 /**
  * Outputs for Translation inference

From 49a1d5027c24f7c022bdf3d9cdf4d6a55301809b Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 11:26:12 +0100
Subject: [PATCH 26/51] =?UTF-8?q?=E2=9C=A8=20Update=20placeholder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/tasks/placeholder/data.ts  |  3 ++
 .../src/tasks/placeholder/spec/input.json     | 44 +++++++++++++++++++
 .../src/tasks/placeholder/spec/output.json    | 16 +++++++
 3 files changed, 63 insertions(+)
 create mode 100644 packages/tasks/src/tasks/placeholder/spec/input.json
 create mode 100644 packages/tasks/src/tasks/placeholder/spec/output.json

diff --git a/packages/tasks/src/tasks/placeholder/data.ts b/packages/tasks/src/tasks/placeholder/data.ts
index 0cbc735ad9..110b43703e 100644
--- a/packages/tasks/src/tasks/placeholder/data.ts
+++ b/packages/tasks/src/tasks/placeholder/data.ts
@@ -13,6 +13,9 @@ const taskData: TaskDataCustom = {
 	summary: "",
 	widgetModels: [],
 	youtubeId: undefined,
+	/// If this is a subtask, link to the most general task ID
+	/// (eg, text2text-generation is the canonical ID of translation)
+	canonicalId: undefined,
 };
 
 export default taskData;
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
new file mode 100644
index 0000000000..ad61eb7ae1
--- /dev/null
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -0,0 +1,44 @@
+{
+	"$id": "/inference/schemas/<TASK_ID>/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for <TASK_ID> inference",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"description": "TODO: describe the input here. This must be model & framework agnostic.",
+			"anyOf": [
+				{
+					"type": "string"
+				},
+				{
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
+				}
+			]
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/<TASK_ID>Parameters"
+		}
+	},
+	"$defs": {
+		"<TASK_ID>Parameters": {
+			"title": "<TASK_ID>Parameters",
+			"description": "TODO: describe additional parameters here.",
+			"type": "object",
+			"properties": {
+				"dummyParameterName": {
+					"type": "boolean",
+					"description": "TODO: describe the parameter here"
+				},
+				"dummyParameterName2": {
+					"type": "integer",
+					"description": "TODO: describe the parameter here"
+				}
+			}
+		}
+	},
+	"required": ["inputs"]
+}
diff --git a/packages/tasks/src/tasks/placeholder/spec/output.json b/packages/tasks/src/tasks/placeholder/spec/output.json
new file mode 100644
index 0000000000..b4b4225f63
--- /dev/null
+++ b/packages/tasks/src/tasks/placeholder/spec/output.json
@@ -0,0 +1,16 @@
+{
+	"$id": "/inference/schemas/<TASK_ID>/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs for <TASK_ID> inference",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"meaningfulOutputName": {
+				"type": "string",
+				"description": "TODO: Describe what is output by the inference here"
+			}
+		},
+		"required": ["meaningfulOutputName"]
+	}
+}

From f4784bf7436e963f3a94cab1b501ea76c191123a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 19:48:09 +0100
Subject: [PATCH 27/51] Changes from code review

---
 .../src/tasks/audio-classification/inference.ts  |  2 +-
 .../tasks/audio-classification/spec/input.json   |  2 +-
 .../document-question-answering/inference.ts     |  2 +-
 .../document-question-answering/spec/input.json  |  2 +-
 .../src/tasks/image-classification/inference.ts  |  6 ++++++
 .../tasks/image-classification/spec/input.json   | 16 ++++++++++++++++
 .../src/tasks/image-segmentation/inference.ts    |  4 ++++
 .../tasks/image-segmentation/spec/output.json    |  4 ++++
 8 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 2ac2e50656..9a108b61d4 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -9,7 +9,7 @@
  */
 export interface AudioClassificationInput {
 	/**
-	 * On or several audio files to classify
+	 * One or several audio files to classify
 	 */
 	inputs: unknown;
 	/**
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 29357710d4..685e92a0f7 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -5,7 +5,7 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "On or several audio files to classify"
+			"description": "One or several audio files to classify"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 8dec0976f2..d9f01c50fa 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -9,7 +9,7 @@
  */
 export interface DocumentQuestionAnsweringInput {
 	/**
-	 * The
+	 * One or several document+question pairs to answer
 	 */
 	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
 	/**
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 86d0708c58..394182f436 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -5,7 +5,7 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The ",
+			"description": "One or several document+question pairs to answer",
 			"anyOf": [
 				{
 					"$ref": "#/$defs/DocumentAndQuestion"
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index dfff0cfd9b..92018d69f9 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -25,6 +25,10 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: ImageClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
@@ -32,6 +36,8 @@ export interface ImageClassificationParameters {
 	[property: string]: unknown;
 }
 
+export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs of inference for the Image Classification task
  */
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 875fae0e0e..ecd23443d5 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Image Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "ImageClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"topK": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 8b5e6da56b..5cd1af00f8 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -60,5 +60,9 @@ export interface ImageSegmentationOutput {
 	 * The corresponding mask as a black-and-white image
 	 */
 	mask: unknown;
+	/**
+	 * The score or confidence degreee the model has
+	 */
+	score?: number;
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 80db732e3e..4b7cb643c8 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -13,6 +13,10 @@
 			},
 			"mask": {
 				"description": "The corresponding mask as a black-and-white image"
+			},
+			"score": {
+				"type": "number",
+				"description": "The score or confidence degreee the model has"
 			}
 		},
 		"required": ["label", "mask"]

From a33987fe219616a75c0e9fca79a65164472a9dd0 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Tue, 23 Jan 2024 19:50:42 +0100
Subject: [PATCH 28/51] mark image & question as required in doc QA

---
 .../tasks/src/tasks/document-question-answering/inference.ts  | 4 ++--
 .../src/tasks/document-question-answering/spec/input.json     | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index d9f01c50fa..092268360f 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -23,11 +23,11 @@ export interface DocumentQuestionAnsweringInpu {
 	/**
 	 * The image on which the question is asked
 	 */
-	image?: unknown;
+	image: unknown;
 	/**
 	 * A question to ask of the document
 	 */
-	question?: string;
+	question: string;
 	[property: string]: unknown;
 }
 
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 394182f436..753790c93e 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -88,7 +88,8 @@
 					"type": "string",
 					"description": "A question to ask of the document"
 				}
-			}
+			},
+			"required": ["image", "question"]
 		}
 	},
 	"required": ["inputs"]

From 6558af4db61b144ab2d521732b473abb31011b0a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Wed, 24 Jan 2024 19:26:53 +0100
Subject: [PATCH 29/51] Document QA: rename input element to inputsingle

---
 .../src/tasks/document-question-answering/inference.ts     | 4 ++--
 .../src/tasks/document-question-answering/spec/input.json  | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 092268360f..f8963c0bde 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -11,7 +11,7 @@ export interface DocumentQuestionAnsweringInput {
 	/**
 	 * One or several document+question pairs to answer
 	 */
-	inputs: DocumentQuestionAnsweringInpu[] | DocumentQuestionAnsweringInpu;
+	inputs: DocumentQuestionAnsweringInputSingle[] | DocumentQuestionAnsweringInputSingle;
 	/**
 	 * Additional inference parameters
 	 */
@@ -19,7 +19,7 @@ export interface DocumentQuestionAnsweringInput {
 	[property: string]: unknown;
 }
 
-export interface DocumentQuestionAnsweringInpu {
+export interface DocumentQuestionAnsweringInputSingle {
 	/**
 	 * The image on which the question is asked
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 753790c93e..84d286e231 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -8,12 +8,12 @@
 			"description": "One or several document+question pairs to answer",
 			"anyOf": [
 				{
-					"$ref": "#/$defs/DocumentAndQuestion"
+					"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 				},
 				{
 					"type": "array",
 					"items": {
-						"$ref": "#/$defs/DocumentAndQuestion"
+						"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 					}
 				}
 			]
@@ -78,8 +78,9 @@
 				}
 			}
 		},
-		"DocumentAndQuestion": {
+		"DocumentQuestionAnsweringInputSingle": {
 			"type": "object",
+			"title": "DocumentQuestionAnsweringInputSingle",
 			"properties": {
 				"image": {
 					"description": "The image on which the question is asked"

From 0724e261fe94dea2676240d7fcef55a955902dbc Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:24:42 +0100
Subject: [PATCH 30/51] No batching

---
 .vscode/settings.json                         |   5 +
 .../tasks/audio-classification/inference.ts   |  56 ++++---
 .../audio-classification/spec/input.json      |  22 ++-
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../spec/input.json                           |   4 +-
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../tasks/depth-estimation/spec/input.json    |   4 +-
 .../document-question-answering/inference.ts  | 153 +++++++++---------
 .../spec/input.json                           |  18 +--
 .../src/tasks/feature-extraction/inference.ts |  24 +--
 .../tasks/feature-extraction/spec/input.json  |  22 +--
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 ++++-----
 .../tasks/src/tasks/fill-mask/spec/input.json |  35 ++--
 .../tasks/image-classification/inference.ts   |  58 +++----
 .../image-classification/spec/input.json      |   6 +-
 .../src/tasks/image-segmentation/inference.ts |  82 +++++-----
 .../tasks/image-segmentation/spec/input.json  |   6 +-
 .../src/tasks/image-to-image/inference.ts     |  22 +--
 .../src/tasks/image-to-image/spec/input.json  |   6 +-
 .../src/tasks/image-to-text/inference.ts      |  42 ++---
 .../src/tasks/image-to-text/spec/input.json   |   6 +-
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../tasks/object-detection/spec/input.json    |   6 +-
 .../src/tasks/placeholder/spec/input.json     |   4 +-
 .../src/tasks/question-answering/inference.ts | 151 ++++++++---------
 .../tasks/question-answering/spec/input.json  |  40 ++---
 .../tasks/sentence-similarity/inference.ts    |  38 ++---
 .../tasks/sentence-similarity/spec/input.json |  31 ++--
 .../src/tasks/summarization/inference.ts      |  58 +++----
 .../table-question-answering/inference.ts     |  80 ++++-----
 .../table-question-answering/spec/input.json  |  23 +--
 .../tasks/text-classification/inference.ts    |  58 +++----
 .../tasks/text-classification/spec/input.json |  18 +--
 .../src/tasks/text-generation/inference.ts    | 128 +++++++--------
 .../src/tasks/text-generation/spec/input.json |  16 +-
 .../src/tasks/text-to-audio/inference.ts      |  40 ++---
 .../src/tasks/text-to-audio/spec/input.json   |  18 +--
 .../src/tasks/text-to-speech/inference.ts     |  40 ++---
 .../tasks/text2text-generation/inference.ts   |  58 +++----
 .../text2text-generation/spec/input.json      |  20 +--
 .../tasks/token-classification/inference.ts   |  92 +++++------
 .../token-classification/spec/input.json      |  18 +--
 .../tasks/src/tasks/translation/inference.ts  |  58 +++----
 .../tasks/video-classification/inference.ts   |  72 +++++----
 .../video-classification/spec/input.json      |  20 ++-
 .../visual-question-answering/inference.ts    |  77 ++++-----
 .../visual-question-answering/spec/input.json |  31 ++--
 .../zero-shot-classification/inference.ts     |  87 +++++-----
 .../zero-shot-classification/spec/input.json  |  33 ++--
 .../inference.ts                              |  73 +++++----
 .../spec/input.json                           |  31 ++--
 .../zero-shot-object-detection/inference.ts   |  81 +++++-----
 .../spec/input.json                           |  31 ++--
 53 files changed, 1131 insertions(+), 1185 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5e40510b2f..072ae9648a 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,5 +6,10 @@
 	},
 	"[svelte]": {
 		"editor.defaultFormatter": "esbenp.prettier-vscode"
+	},
+	"prettier.configPath": ".prettierrc",
+	"json.format.enable": false,
+	"[json]": {
+		"editor.defaultFormatter": "esbenp.prettier-vscode"
 	}
 }
diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 9a108b61d4..1ba43fae10 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-	/**
-	 * One or several audio files to classify
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: AudioClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input audio data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: AudioClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,24 +27,30 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: AudioClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
+export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 685e92a0f7..60062756cb 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several audio files to classify"
+		"input": {
+			"description": "The input audio data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Audio Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "AudioClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"topK": {
 					"type": "integer",
 					"description": "When specified, limits the output to the top K most probable classes."
@@ -25,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 6eb20d0c16..7ddfe0055e 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,31 +1,33 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-	/**
-	 * The input audio data
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input audio data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The recognized text.
-	 */
-	text: string;
-	[property: string]: unknown;
+    /**
+     * The recognized text.
+     */
+    text: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index a4034b5e14..147851e579 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -20,5 +20,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 48b9d3438e..9019043eb9 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-	/**
-	 * The input image data
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DepthEstimationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DepthEstimationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,9 +29,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index f33df64448..80f4919746 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index f8963c0bde..a04b3b39b5 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-	/**
-	 * One or several document+question pairs to answer
-	 */
-	inputs: DocumentQuestionAnsweringInputSingle[] | DocumentQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: DocumentQuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (document, question) pair to answer
+     */
+    input: DocumentQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: DocumentQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * One (document, question) pair to answer
+ */
 export interface DocumentQuestionAnsweringInputSingle {
-	/**
-	 * The image on which the question is asked
-	 */
-	image: unknown;
-	/**
-	 * A question to ask of the document
-	 */
-	question: string;
-	[property: string]: unknown;
+    /**
+     * The image on which the question is asked
+     */
+    image: unknown;
+    /**
+     * A question to ask of the document
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,65 +42,65 @@ export interface DocumentQuestionAnsweringInputSingle {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-	/**
-	 * If the words in the document are too long to fit with the question for the model, it will
-	 * be split in several chunks with some overlap. This argument controls the size of that
-	 * overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * Language to use while running OCR. Defaults to english.
-	 */
-	lang?: string;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using doc_stride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Can return less
-	 * than top_k answers if there are not enough options available within the context.
-	 */
-	topK?: number;
-	/**
-	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-	 * skip the OCR step and use the provided bounding boxes instead.
-	 */
-	wordBoxes?: Array<number[] | string>;
-	[property: string]: unknown;
+    /**
+     * If the words in the document are too long to fit with the question for the model, it will
+     * be split in several chunks with some overlap. This argument controls the size of that
+     * overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * Language to use while running OCR. Defaults to english.
+     */
+    lang?: string;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using doc_stride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Can return less
+     * than top_k answers if there are not enough options available within the context.
+     */
+    topK?: number;
+    /**
+     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+     * skip the OCR step and use the provided bounding boxes instead.
+     */
+    wordBoxes?: Array<number[] | string>;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
-	[property: string]: unknown;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    end:    number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    start: number;
+    /**
+     * The index of each word/box pair that is in the answer
+     */
+    words: number[];
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 84d286e231..580a95c92b 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several document+question pairs to answer",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
-					}
-				}
-			]
+		"input": {
+			"description": "One (document, question) pair to answer",
+			"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -93,5 +83,5 @@
 			"required": ["image", "question"]
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 5c237d6dd6..d8674f516d 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,20 +1,22 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
- * Inputs for Feature Extraction inference
+ * Inputs for Text Embedding inference
  */
 export interface FeatureExtractionInput {
-	/**
-	 * One or several texts to get the features of
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The text to get the embeddings of
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 8bf05339a3..e2eadc4e91 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,22 +1,12 @@
 {
-	"$id": "/inference/schemas/feature-extraction/input.json",
+	"$id": "/inference/schemas/text-embedding/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Inputs for Feature Extraction inference",
+	"description": "Inputs for Text Embedding inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to get the features of",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text to get the embeddings of",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -31,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index 097718900f..e01feec2ff 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-	/**
-	 * One or several texts with masked tokens
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: FillMaskParameters;
-	[property: string]: unknown;
+    /**
+     * The text with masked tokens
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: FillMaskParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,39 +27,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-	/**
-	 * When passed, the model will limit the scores to the passed targets instead of looking up
-	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-	 * tokenized and the first resulting token will be used (with a warning, and that might be
-	 * slower).
-	 */
-	targets?: string[] | string;
-	/**
-	 * When passed, overrides the number of predictions to return.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * When passed, the model will limit the scores to the passed targets instead of looking up
+     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+     * tokenized and the first resulting token will be used (with a warning, and that might be
+     * slower).
+     */
+    targets?: string[];
+    /**
+     * When passed, overrides the number of predictions to return.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-	/**
-	 * The corresponding probability
-	 */
-	score: number;
-	/**
-	 * The corresponding input with the mask token prediction.
-	 */
-	sequence: string;
-	/**
-	 * The predicted token id (to replace the masked one).
-	 */
-	token: number;
-	/**
-	 * The predicted token (to replace the masked one).
-	 */
-	tokenStr: string;
-	[property: string]: unknown;
+    /**
+     * The corresponding probability
+     */
+    score: number;
+    /**
+     * The corresponding input with the mask token prediction.
+     */
+    sequence: string;
+    /**
+     * The predicted token id (to replace the masked one).
+     */
+    token: number;
+    /**
+     * The predicted token (to replace the masked one).
+     */
+    tokenStr: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 6f7402efbe..62f935fe2d 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts with masked tokens",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text with masked tokens",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -34,21 +24,14 @@
 					"description": "When passed, overrides the number of predictions to return."
 				},
 				"targets": {
-					"anyOf": [
-						{
-							"type": "string"
-						},
-						{
-							"type": "array",
-							"items": {
-								"type": "string"
-							}
-						}
-					],
-					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower)."
+					"description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower).",
+					"type": "array",
+					"items": {
+						"type": "string"
+					}
 				}
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 92018d69f9..4885312553 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-	/**
-	 * On or several image files to classify
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: ImageClassificationOutputTransform;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: ImageClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index ecd23443d5..fec2d9aa00 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "On or several image files to classify"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 5cd1af00f8..ba31379ef0 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-	/**
-	 * One or several image files to perform segmentation on
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageSegmentationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageSegmentationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,23 +27,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-	/**
-	 * Threshold to use when turning the predicted masks into binary values.
-	 */
-	maskThreshold?: number;
-	/**
-	 * Mask overlap threshold to eliminate small, disconnected segments.
-	 */
-	overlapMaskAreaThreshold?: number;
-	/**
-	 * Segmentation task to be performed, depending on model capabilities.
-	 */
-	subtask?: ImageSegmentationSubtask;
-	/**
-	 * Probability threshold to filter out predicted masks.
-	 */
-	threshold?: number;
-	[property: string]: unknown;
+    /**
+     * Threshold to use when turning the predicted masks into binary values.
+     */
+    maskThreshold?: number;
+    /**
+     * Mask overlap threshold to eliminate small, disconnected segments.
+     */
+    overlapMaskAreaThreshold?: number;
+    /**
+     * Segmentation task to be performed, depending on model capabilities.
+     */
+    subtask?: ImageSegmentationSubtask;
+    /**
+     * Probability threshold to filter out predicted masks.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
@@ -52,17 +54,17 @@ export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-	/**
-	 * The label of the predicted segment
-	 */
-	label: string;
-	/**
-	 * The corresponding mask as a black-and-white image
-	 */
-	mask: unknown;
-	/**
-	 * The score or confidence degreee the model has
-	 */
-	score?: number;
-	[property: string]: unknown;
+    /**
+     * The label of the predicted segment
+     */
+    label: string;
+    /**
+     * The corresponding mask as a black-and-white image
+     */
+    mask: unknown;
+    /**
+     * The score or confidence degreee the model has
+     */
+    score?: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 06a80028b7..5e2a115e33 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several image files to perform segmentation on"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index c1f1a5cb8b..6ee8d47ba1 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-	/**
-	 * One or more images to generate images from
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: unknown;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: unknown;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 38b1202efa..61653d3c3e 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more images to generate images from"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -18,5 +18,5 @@
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 029db76daa..d4430b5c56 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-	/**
-	 * One or several images to generated text for
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ImageToTextParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ImageToTextParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,20 +27,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-	/**
-	 * The amount of maximum tokens to generate.
-	 */
-	maxNewTokens?: number;
-	[property: string]: unknown;
+    /**
+     * The amount of maximum tokens to generate.
+     */
+    maxNewTokens?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 140f9e27e6..0ae6331c55 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to generated text for"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 5675eb53a2..0d38adb58a 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-	/**
-	 * One or several input images to perform object detection on
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ObjectDetectionParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ObjectDetectionParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,31 +27,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-	/**
-	 * The probability necessary to make a prediction.
-	 */
-	threshold?: number;
-	[property: string]: unknown;
+    /**
+     * The probability necessary to make a prediction.
+     */
+    threshold?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * The predicted label for the bounding box
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: BoundingBox;
+    /**
+     * The predicted label for the bounding box
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -57,9 +59,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index f8647e78a9..5055f2e17f 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several input images to perform object detection on"
+		"input": {
+			"description": "The input image data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index ad61eb7ae1..8dc1c32613 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for <TASK_ID> inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "TODO: describe the input here. This must be model & framework agnostic.",
 			"anyOf": [
 				{
@@ -40,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 58da43f6d1..7c3dd476a2 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-	/**
-	 * One or several question+context pairs to answer
-	 */
-	inputs: SquadExample[] | SquadExample;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: QuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (context, question) pair to answer
+     */
+    input: Input;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: QuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
-export interface SquadExample {
-	/**
-	 * The context to be used for answering the question
-	 */
-	context: string;
-	/**
-	 * The question to be answered
-	 */
-	question: string;
-	[property: string]: unknown;
+/**
+ * One (context, question) pair to answer
+ */
+export interface Input {
+    /**
+     * The context to be used for answering the question
+     */
+    context: string;
+    /**
+     * The question to be answered
+     */
+    question: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,63 +42,63 @@ export interface SquadExample {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-	/**
-	 * Attempts to align the answer to real words. Improves quality on space separated
-	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-	 */
-	alignToWords?: boolean;
-	/**
-	 * If the context is too long to fit with the question for the model, it will be split in
-	 * several chunks with some overlap. This argument controls the size of that overlap.
-	 */
-	docStride?: number;
-	/**
-	 * Whether to accept impossible as an answer.
-	 */
-	handleImpossibleAnswer?: boolean;
-	/**
-	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
-	 * considered).
-	 */
-	maxAnswerLen?: number;
-	/**
-	 * The maximum length of the question after tokenization. It will be truncated if needed.
-	 */
-	maxQuestionLen?: number;
-	/**
-	 * The maximum length of the total sentence (context + question) in tokens of each chunk
-	 * passed to the model. The context will be split in several chunks (using docStride as
-	 * overlap) if needed.
-	 */
-	maxSeqLen?: number;
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * Attempts to align the answer to real words. Improves quality on space separated
+     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+     */
+    alignToWords?: boolean;
+    /**
+     * If the context is too long to fit with the question for the model, it will be split in
+     * several chunks with some overlap. This argument controls the size of that overlap.
+     */
+    docStride?: number;
+    /**
+     * Whether to accept impossible as an answer.
+     */
+    handleImpossibleAnswer?: boolean;
+    /**
+     * The maximum length of predicted answers (e.g., only answers with a shorter length are
+     * considered).
+     */
+    maxAnswerLen?: number;
+    /**
+     * The maximum length of the question after tokenization. It will be truncated if needed.
+     */
+    maxQuestionLen?: number;
+    /**
+     * The maximum length of the total sentence (context + question) in tokens of each chunk
+     * passed to the model. The context will be split in several chunks (using docStride as
+     * overlap) if needed.
+     */
+    maxSeqLen?: number;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-	/**
-	 * The answer to the question.
-	 */
-	answer: string;
-	/**
-	 * The character position in the input where the answer ends.
-	 */
-	end: number;
-	/**
-	 * The probability associated to the answer.
-	 */
-	score: number;
-	/**
-	 * The character position in the input where the answer begins.
-	 */
-	start: number;
-	[property: string]: unknown;
+    /**
+     * The answer to the question.
+     */
+    answer: string;
+    /**
+     * The character position in the input where the answer ends.
+     */
+    end: number;
+    /**
+     * The probability associated to the answer.
+     */
+    score: number;
+    /**
+     * The character position in the input where the answer begins.
+     */
+    start: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 9eab32e13a..da38a8f8aa 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -4,41 +4,27 @@
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several question+context pairs to answer",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/SquadExample"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/SquadExample"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/QuestionAnsweringParameters"
-		}
-	},
-	"$defs": {
-		"SquadExample": {
-			"title": "SquadExample",
+		"input": {
+			"description": "One (context, question) pair to answer",
 			"type": "object",
 			"properties": {
-				"question": {
-					"type": "string",
-					"description": "The question to be answered"
-				},
 				"context": {
 					"type": "string",
 					"description": "The context to be used for answering the question"
+				},
+				"question": {
+					"type": "string",
+					"description": "The question to be answered"
 				}
 			},
 			"required": ["question", "context"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/QuestionAnsweringParameters"
+		}
+	},
+	"$defs": {
 		"QuestionAnsweringParameters": {
 			"title": "QuestionAnsweringParameters",
 			"description": "Additional inference parameters for Question Answering",
@@ -75,5 +61,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 252326caf1..2b0df40114 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,32 +1,34 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-	inputs: SentenceSimilarityInputSingle[] | SentenceSimilarityInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    input: InputObject;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
-export interface SentenceSimilarityInputSingle {
-	/**
-	 * A list of strings which will be compared against the source_sentence.
-	 */
-	sentences: string[];
-	/**
-	 * The string that you wish to compare the other strings with. This can be a phrase,
-	 * sentence, or longer passage, depending on the model being used.
-	 */
-	sourceSentence: string;
-	[property: string]: unknown;
+export interface InputObject {
+    /**
+     * A list of strings which will be compared against the source_sentence.
+     */
+    sentences: string[];
+    /**
+     * The string that you wish to compare the other strings with. This can be a phrase,
+     * sentence, or longer passage, depending on the model being used.
+     */
+    sourceSentence: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index cfb884abee..2e85543f88 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Sentence similarity inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"anyOf": [
-				{
-					"$ref": "#/$defs/SentenceSimilarityInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/SentenceSimilarityInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/SentenceSimilarityParameters"
-		}
-	},
-	"$defs": {
-		"SentenceSimilarityInputSingle": {
-			"title": "SentenceSimilarityInputSingle",
+		"input": {
+			"title": "SentenceSimilarityInput",
 			"type": "object",
 			"properties": {
 				"sourceSentence": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["sourceSentence", "sentences"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/SentenceSimilarityParameters"
+		}
+	},
+	"$defs": {
 		"SentenceSimilarityParameters": {
 			"title": "SentenceSimilarityParameters",
 			"description": "Additional inference parameters for Sentence Similarity",
@@ -48,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index d38632cd4f..9a063e93cc 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 35b172a5da..21cc519b09 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,59 +1,61 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-	/**
-	 * One or several questions about a table
-	 */
-	inputs: Inputs;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * One (table, question) pair to answer
+     */
+    input: Input;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
- * One or several questions about a table
+ * One (table, question) pair to answer
  */
-export interface Inputs {
-	/**
-	 * One or several questions to be answered about the table
-	 */
-	question?: string[] | string;
-	/**
-	 * The table to serve as context for the questions
-	 */
-	table?: { [key: string]: unknown };
-	[property: string]: unknown;
+export interface Input {
+    /**
+     * The question to be answered about the table
+     */
+    question: string;
+    /**
+     * The table to serve as context for the questions
+     */
+    table: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-	/**
-	 * If the model has an aggregator, this returns the aggregator.
-	 */
-	aggregator?: string;
-	/**
-	 * The answer of the question given the table. If there is an aggregator, the answer will be
-	 * preceded by `AGGREGATOR >`.
-	 */
-	answer: string;
-	/**
-	 * List of strings made up of the answer cell values.
-	 */
-	cells: string[];
-	/**
-	 * Coordinates of the cells of the answers.
-	 */
-	coordinates: Array<number[]>;
-	[property: string]: unknown;
+    /**
+     * If the model has an aggregator, this returns the aggregator.
+     */
+    aggregator?: string;
+    /**
+     * The answer of the question given the table. If there is an aggregator, the answer will be
+     * preceded by `AGGREGATOR >`.
+     */
+    answer: string;
+    /**
+     * List of strings made up of the answer cell values.
+     */
+    cells: string[];
+    /**
+     * Coordinates of the cells of the answers.
+     */
+    coordinates: Array<number[]>;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index aa7c7231f8..3ceb5c07a0 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several questions about a table",
+		"input": {
+			"description": "One (table, question) pair to answer",
 			"type": "object",
 			"properties": {
 				"table": {
@@ -13,20 +13,11 @@
 					"type": "object"
 				},
 				"question": {
-					"description": "One or several questions to be answered about the table",
-					"anyOf": [
-						{
-							"type": "string"
-						},
-						{
-							"type": "array",
-							"items": {
-								"type": "string"
-							}
-						}
-					]
+					"description": "The question to be answered about the table",
+					"type": "string"
 				}
-			}
+			},
+			"required": ["table", "question"]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -41,5 +32,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 58a54af0c4..6e09d5c7ce 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-	/**
-	 * One or several texts to classify
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The text to classify
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,15 +27,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: TextClassificationOutputTransform;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -42,13 +44,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 73b14c794c..85c8468ad4 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to classify",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The text to classify",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -52,5 +42,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 62af0a9c59..cd83abbb2e 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-	/**
-	 * The text to initialize generation with
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The text to initialize generation with
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,63 +27,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-	/**
-	 * Whether to use logit sampling (true) or greedy search (false).
-	 */
-	doSample?: boolean;
-	/**
-	 * Maximum number of generated tokens.
-	 */
-	maxNewTokens?: number;
-	/**
-	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-	 * paper](https://hf.co/papers/1909.05858) for more details.
-	 */
-	repetitionPenalty?: number;
-	/**
-	 * Whether to prepend the prompt to the generated text.
-	 */
-	returnFullText?: boolean;
-	/**
-	 * Stop generating tokens if a member of `stop_sequences` is generated.
-	 */
-	stopSequences?: string[];
-	/**
-	 * The value used to modulate the logits distribution.
-	 */
-	temperature?: number;
-	/**
-	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-	 */
-	topK?: number;
-	/**
-	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-	 * up to `top_p` or higher are kept for generation.
-	 */
-	topP?: number;
-	/**
-	 * Truncate input tokens to the given size.
-	 */
-	truncate?: number;
-	/**
-	 * Typical Decoding mass. See [Typical Decoding for Natural Language
-	 * Generation](https://hf.co/papers/2202.00666) for more information
-	 */
-	typicalP?: number;
-	/**
-	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-	 */
-	watermark?: boolean;
-	[property: string]: unknown;
+    /**
+     * Whether to use logit sampling (true) or greedy search (false).
+     */
+    doSample?: boolean;
+    /**
+     * Maximum number of generated tokens.
+     */
+    maxNewTokens?: number;
+    /**
+     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+     * paper](https://hf.co/papers/1909.05858) for more details.
+     */
+    repetitionPenalty?: number;
+    /**
+     * Whether to prepend the prompt to the generated text.
+     */
+    returnFullText?: boolean;
+    /**
+     * Stop generating tokens if a member of `stop_sequences` is generated.
+     */
+    stopSequences?: string[];
+    /**
+     * The value used to modulate the logits distribution.
+     */
+    temperature?: number;
+    /**
+     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+     */
+    topK?: number;
+    /**
+     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+     * up to `top_p` or higher are kept for generation.
+     */
+    topP?: number;
+    /**
+     * Truncate input tokens to the given size.
+     */
+    truncate?: number;
+    /**
+     * Typical Decoding mass. See [Typical Decoding for Natural Language
+     * Generation](https://hf.co/papers/2202.00666) for more information
+     */
+    typicalP?: number;
+    /**
+     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+     */
+    watermark?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-	/**
-	 * The generated text
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 9b5d3d08ef..b1cf45995d 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "The text to initialize generation with",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -79,5 +69,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 3b23948a36..71e745c21d 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,35 +1,37 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: unknown;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 96febb6fc1..a0802c1027 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts to generate audio for",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -31,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index 766b23a382..be5c01981f 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-	/**
-	 * One or several texts to generate audio for
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
 /**
@@ -27,13 +29,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-	/**
-	 * The generated audio waveform.
-	 */
-	audio: unknown;
-	/**
-	 * The sampling rate of the generated audio waveform.
-	 */
-	samplingRate: number;
-	[property: string]: unknown;
+    /**
+     * The generated audio waveform.
+     */
+    audio: unknown;
+    /**
+     * The sampling rate of the generated audio waveform.
+     */
+    samplingRate: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 7d2c7182e3..ce66234225 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -46,9 +48,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index e8a0b9cd08..495b5a2817 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Text2text Generation inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more texts to use for text2text generation",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -52,7 +42,7 @@
 						}
 					]
 				},
-				"Parameters": {
+				"generateParameters": {
 					"title": "generateParameters",
 					"type": "object",
 					"description": "Additional parametrization of the text generation algorithm"
@@ -60,5 +50,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index fa18ba34b8..629c47c471 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-	/**
-	 * One or several texts which tokens are to be classified
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: TokenClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: TokenClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,19 +27,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-	/**
-	 * The strategy used to fuse tokens based on model predictions
-	 */
-	aggregationStrategy?: TokenClassificationAggregationStrategy;
-	/**
-	 * A list of labels to ignore
-	 */
-	ignoreLabels?: string[];
-	/**
-	 * The number of overlapping tokens between chunks when splitting the input text.
-	 */
-	stride?: number;
-	[property: string]: unknown;
+    /**
+     * The strategy used to fuse tokens based on model predictions
+     */
+    aggregationStrategy?: TokenClassificationAggregationStrategy;
+    /**
+     * A list of labels to ignore
+     */
+    ignoreLabels?: string[];
+    /**
+     * The number of overlapping tokens between chunks when splitting the input text.
+     */
+    stride?: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -60,26 +62,26 @@ export type TokenClassificationAggregationStrategy = "none" | "simple" | "first"
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-	/**
-	 * The character position in the input where this group ends.
-	 */
-	end?: number;
-	/**
-	 * The predicted label for that group of tokens
-	 */
-	entityGroup?: string;
-	label: unknown;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	/**
-	 * The character position in the input where this group begins.
-	 */
-	start?: number;
-	/**
-	 * The corresponding text
-	 */
-	word?: string;
-	[property: string]: unknown;
+    /**
+     * The character position in the input where this group ends.
+     */
+    end?: number;
+    /**
+     * The predicted label for that group of tokens
+     */
+    entityGroup?: string;
+    label:        unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    /**
+     * The character position in the input where this group begins.
+     */
+    start?: number;
+    /**
+     * The corresponding text
+     */
+    word?: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index a2fcf5fdf2..f46b20cf9a 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several texts which tokens are to be classified",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+		"input": {
+			"description": "The input text data",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -70,5 +60,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index ecb1082879..96090808be 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,24 +1,26 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-	/**
-	 * One or more texts to use for text2text generation
-	 */
-	inputs: string[] | string;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: Text2TextGenerationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data
+     */
+    input: string;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: Text2TextGenerationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -27,19 +29,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-	/**
-	 * Whether to clean up the potential extra spaces in the text output.
-	 */
-	cleanUpTokenizationSpaces?: boolean;
-	/**
-	 * Additional parametrization of the text generation algorithm
-	 */
-	parameters?: { [key: string]: unknown };
-	/**
-	 * The truncation strategy to use
-	 */
-	truncation?: Text2TextGenerationTruncationStrategy;
-	[property: string]: unknown;
+    /**
+     * Whether to clean up the potential extra spaces in the text output.
+     */
+    cleanUpTokenizationSpaces?: boolean;
+    /**
+     * Additional parametrization of the text generation algorithm
+     */
+    generateParameters?: { [key: string]: unknown };
+    /**
+     * The truncation strategy to use
+     */
+    truncation?: Text2TextGenerationTruncationStrategy;
+    [property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -50,9 +52,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-	/**
-	 * The generated text.
-	 */
-	generatedText: string;
-	[property: string]: unknown;
+    /**
+     * The generated text.
+     */
+    generatedText: string;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 29b3fed095..2d258b33be 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,22 +1,24 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-	/**
-	 * One or several videos to be classified
-	 */
-	inputs: unknown;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VideoClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * One or several videos to be classified
+     */
+    input: unknown;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VideoClassificationParameters;
+    [property: string]: unknown;
 }
 
 /**
@@ -25,32 +27,38 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-	/**
-	 * The sampling rate used to select frames from the video.
-	 */
-	frameSamplingRate?: number;
-	/**
-	 * The number of sampled frames to consider for classification.
-	 */
-	numFrames?: number;
-	/**
-	 * When specified, limits the output to the top K most probable classes.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The sampling rate used to select frames from the video.
+     */
+    frameSamplingRate?: number;
+    /**
+     * The function to apply to the model outputs in order to retrieve the scores.
+     */
+    functionToApply?: TextClassificationOutputTransform;
+    /**
+     * The number of sampled frames to consider for classification.
+     */
+    numFrames?: number;
+    /**
+     * When specified, limits the output to the top K most probable classes.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
+export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-	/**
-	 * The predicted class label (model specific).
-	 */
-	label: string;
-	/**
-	 * The corresponding probability.
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted class label (model specific).
+     */
+    label: string;
+    /**
+     * The corresponding probability.
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 91b9f9642e..796ce393fa 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
+		"input": {
 			"description": "One or several videos to be classified"
 		},
 		"parameters": {
@@ -18,6 +18,22 @@
 			"description": "Additional inference parameters for Video Classification",
 			"type": "object",
 			"properties": {
+				"functionToApply": {
+					"title": "TextClassificationOutputTransform",
+					"type": "string",
+					"description": "The function to apply to the model outputs in order to retrieve the scores.",
+					"oneOf": [
+						{
+							"const": "sigmoid"
+						},
+						{
+							"const": "softmax"
+						},
+						{
+							"const": "none"
+						}
+					]
+				},
 				"numFrames": {
 					"type": "integer",
 					"description": "The number of sampled frames to consider for classification."
@@ -33,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 3b9070447a..7d192a33d0 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-	/**
-	 * One or more image-question pairs
-	 */
-	inputs: VisualQuestionAnsweringInputSingle[] | VisualQuestionAnsweringInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: VisualQuestionAnsweringParameters;
-	[property: string]: unknown;
+    /**
+     * One (image, question) pair to answer
+     */
+    input: VisualQuestionAnsweringInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: VisualQuestionAnsweringParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * One (image, question) pair to answer
+ */
 export interface VisualQuestionAnsweringInputSingle {
-	/**
-	 * The image.
-	 */
-	image: unknown;
-	/**
-	 * The question to answer based on the image.
-	 */
-	question: unknown;
-	[property: string]: unknown;
+    /**
+     * The image.
+     */
+    image: unknown;
+    /**
+     * The question to answer based on the image.
+     */
+    question: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,27 +42,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-	/**
-	 * The number of answers to return (will be chosen by order of likelihood). Note that we
-	 * return less than topk answers if there are not enough options available within the
-	 * context.
-	 */
-	topK?: number;
-	[property: string]: unknown;
+    /**
+     * The number of answers to return (will be chosen by order of likelihood). Note that we
+     * return less than topk answers if there are not enough options available within the
+     * context.
+     */
+    topK?: number;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-	/**
-	 * The answer to the question
-	 */
-	answer?: string;
-	label: unknown;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The answer to the question
+     */
+    answer?: string;
+    label:   unknown;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index cc6e5d93ab..2e77422d9d 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or more image-question pairs",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/VisualQuestionAnsweringInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
-		}
-	},
-	"$defs": {
-		"VisualQuestionAnsweringInputSingle": {
+		"input": {
+			"description": "One (image, question) pair to answer",
 			"type": "object",
 			"title": "VisualQuestionAnsweringInputSingle",
 			"properties": {
@@ -37,6 +18,12 @@
 			},
 			"required": ["question", "image"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/VisualQuestionAnsweringParameters"
+		}
+	},
+	"$defs": {
 		"VisualQuestionAnsweringParameters": {
 			"title": "VisualQuestionAnsweringParameters",
 			"description": "Additional inference parameters for Visual Question Answering",
@@ -49,5 +36,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 564c6ba629..578f24946d 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-	/**
-	 * One or several text + candidate labels pairs to classify
-	 */
-	inputs: ZeroShotClassificationInputSingle[] | ZeroShotClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input text data, with candidate labels
+     */
+    input: InputObject;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotClassificationParameters;
+    [property: string]: unknown;
 }
 
-export interface ZeroShotClassificationInputSingle {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
-	[property: string]: unknown;
+/**
+ * The input text data, with candidate labels
+ */
+export interface InputObject {
+    /**
+     * The set of possible class labels to classify the text into.
+     */
+    candidateLabels: string[];
+    /**
+     * The text to classify
+     */
+    text: string;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,31 +42,31 @@ export interface ZeroShotClassificationInputSingle {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	/**
-	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
-	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-	 * considered independent and probabilities are normalized for each candidate.
-	 */
-	multiLabel?: boolean;
-	[property: string]: unknown;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    /**
+     * Whether multiple candidate labels can be true. If false, the scores are normalized such
+     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+     * considered independent and probabilities are normalized for each candidate.
+     */
+    multiLabel?: boolean;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index e573f68179..ce10d0b616 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -4,29 +4,10 @@
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several text + candidate labels pairs to classify",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotClassificationInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotClassificationInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotClassificationParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotClassificationInputSingle": {
+		"input": {
+			"description": "The input text data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotClassificationInputSingle",
+			"title": "ZeroShotClassificationInput",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -42,6 +23,12 @@
 			},
 			"required": ["text", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotClassificationParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotClassificationParameters": {
 			"title": "ZeroShotClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Classification",
@@ -58,5 +45,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 0976094a45..970d7708b9 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,34 +1,39 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-	/**
-	 * One or several images to classify
-	 */
-	inputs: ZeroShotImageClassificationInputSingle[] | ZeroShotImageClassificationInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: ZeroShotImageClassificationParameters;
-	[property: string]: unknown;
+    /**
+     * The input image data, with candidate labels
+     */
+    input: ZeroShotImageClassificationInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: ZeroShotImageClassificationParameters;
+    [property: string]: unknown;
 }
 
+/**
+ * The input image data, with candidate labels
+ */
 export interface ZeroShotImageClassificationInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: unknown;
-	[property: string]: unknown;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to classify
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
@@ -37,25 +42,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-	/**
-	 * The sentence used in conjunction with candidateLabels to attempt the text classification
-	 * by replacing the placeholder with the candidate labels.
-	 */
-	hypothesisTemplate?: string;
-	[property: string]: unknown;
+    /**
+     * The sentence used in conjunction with candidateLabels to attempt the text classification
+     * by replacing the placeholder with the candidate labels.
+     */
+    hypothesisTemplate?: string;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 029b19b2dc..07fffa5efe 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to classify",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotImageClassificationInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotImageClassificationInputSingle": {
+		"input": {
+			"description": "The input image data, with candidate labels",
 			"type": "object",
 			"title": "ZeroShotImageClassificationInputSingle",
 			"properties": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["image", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotImageClassificationParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotImageClassificationParameters": {
 			"title": "ZeroShotImageClassificationParameters",
 			"description": "Additional inference parameters for Zero Shot Image Classification",
@@ -53,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 6493541d82..2e3e12f743 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,54 +1,59 @@
+
 /**
  * Inference code generated from the JSON schema spec in ./spec
- *
+ * 
  * Using src/scripts/inference-codegen
  */
 
+
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-	/**
-	 * One or several images to perform object detection on
-	 */
-	inputs: ZeroShotObjectDetectionInputSingle[] | ZeroShotObjectDetectionInputSingle;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
+    /**
+     * The input image data, with candidate labels
+     */
+    input: ZeroShotObjectDetectionInputSingle;
+    /**
+     * Additional inference parameters
+     */
+    parameters?: { [key: string]: unknown };
+    [property: string]: unknown;
 }
 
+/**
+ * The input image data, with candidate labels
+ */
 export interface ZeroShotObjectDetectionInputSingle {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
-	[property: string]: unknown;
+    /**
+     * The candidate labels for this image
+     */
+    candidateLabels: string[];
+    /**
+     * The image data to generate bounding boxes from
+     */
+    image: unknown;
+    [property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
+    /**
+     * The predicted bounding box. Coordinates are relative to the top left corner of the input
+     * image.
+     */
+    box: BoundingBox;
+    /**
+     * A candidate label
+     */
+    label: string;
+    /**
+     * The associated score / probability
+     */
+    score: number;
+    [property: string]: unknown;
 }
 
 /**
@@ -56,9 +61,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
+    xmax: number;
+    xmin: number;
+    ymax: number;
+    ymin: number;
+    [property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index f2929226b1..7f72f3f9bb 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -4,27 +4,8 @@
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
 	"properties": {
-		"inputs": {
-			"description": "One or several images to perform object detection on",
-			"anyOf": [
-				{
-					"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
-				},
-				{
-					"type": "array",
-					"items": {
-						"$ref": "#/$defs/ZeroShotObjectDetectionInputSingle"
-					}
-				}
-			]
-		},
-		"parameters": {
-			"description": "Additional inference parameters",
-			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
-		}
-	},
-	"$defs": {
-		"ZeroShotObjectDetectionInputSingle": {
+		"input": {
+			"description": "The input image data, with candidate labels",
 			"type": "object",
 			"title": "ZeroShotObjectDetectionInputSingle",
 			"properties": {
@@ -41,6 +22,12 @@
 			},
 			"required": ["image", "candidateLabels"]
 		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/ZeroShotObjectDetectionParameters"
+		}
+	},
+	"$defs": {
 		"ZeroShotObjectDetectionParameters": {
 			"title": "ZeroShotObjectDetectionParameters",
 			"description": "Additional inference parameters for Zero Shot Object Detection",
@@ -48,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["inputs"]
+	"required": ["input"]
 }

From 29f5975a715f519c50c42473f1c2fbeea212f9da Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:35:25 +0100
Subject: [PATCH 31/51] rename input -> data

---
 .../tasks/audio-classification/inference.ts   |  58 ++++---
 .../audio-classification/spec/input.json      |   4 +-
 .../automatic-speech-recognition/inference.ts |  32 ++--
 .../spec/input.json                           |   4 +-
 .../src/tasks/depth-estimation/inference.ts   |  32 ++--
 .../tasks/depth-estimation/spec/input.json    |   4 +-
 .../document-question-answering/inference.ts  | 152 +++++++++---------
 .../spec/input.json                           |  31 ++--
 .../src/tasks/feature-extraction/inference.ts |  22 ++-
 .../tasks/feature-extraction/spec/input.json  |   4 +-
 .../tasks/src/tasks/fill-mask/inference.ts    |  80 +++++----
 .../tasks/src/tasks/fill-mask/spec/input.json |   4 +-
 .../tasks/image-classification/inference.ts   |  58 ++++---
 .../image-classification/spec/input.json      |   4 +-
 .../src/tasks/image-segmentation/inference.ts |  82 +++++-----
 .../tasks/image-segmentation/spec/input.json  |   4 +-
 .../src/tasks/image-to-image/inference.ts     |  22 ++-
 .../src/tasks/image-to-image/spec/input.json  |   4 +-
 .../src/tasks/image-to-text/inference.ts      |  42 +++--
 .../src/tasks/image-to-text/spec/input.json   |   4 +-
 .../src/tasks/object-detection/inference.ts   |  70 ++++----
 .../tasks/object-detection/spec/input.json    |   4 +-
 .../src/tasks/placeholder/spec/input.json     |  16 +-
 .../src/tasks/question-answering/inference.ts | 148 +++++++++--------
 .../tasks/question-answering/spec/input.json  |   5 +-
 .../tasks/sentence-similarity/inference.ts    |  38 +++--
 .../tasks/sentence-similarity/spec/input.json |   6 +-
 .../sentence-similarity/spec/output.json      |   3 +-
 .../src/tasks/summarization/inference.ts      |  58 ++++---
 .../table-question-answering/inference.ts     |  78 +++++----
 .../table-question-answering/spec/input.json  |   5 +-
 .../tasks/text-classification/inference.ts    |  58 ++++---
 .../tasks/text-classification/spec/input.json |   4 +-
 .../src/tasks/text-generation/inference.ts    | 128 ++++++++-------
 .../src/tasks/text-generation/spec/input.json |   4 +-
 .../src/tasks/text-to-audio/inference.ts      |  40 +++--
 .../src/tasks/text-to-audio/spec/input.json   |   4 +-
 .../src/tasks/text-to-speech/inference.ts     |  40 +++--
 .../tasks/text2text-generation/inference.ts   |  58 ++++---
 .../text2text-generation/spec/input.json      |   4 +-
 .../tasks/token-classification/inference.ts   |  92 ++++++-----
 .../token-classification/spec/input.json      |   4 +-
 .../tasks/src/tasks/translation/inference.ts  |  58 ++++---
 .../tasks/video-classification/inference.ts   |  74 +++++----
 .../video-classification/spec/input.json      |   6 +-
 .../visual-question-answering/inference.ts    |  76 +++++----
 .../visual-question-answering/spec/input.json |   6 +-
 .../zero-shot-classification/inference.ts     |  84 +++++-----
 .../zero-shot-classification/spec/input.json  |   6 +-
 .../inference.ts                              |  72 ++++-----
 .../spec/input.json                           |   6 +-
 .../zero-shot-object-detection/inference.ts   |  80 +++++----
 .../spec/input.json                           |   6 +-
 53 files changed, 962 insertions(+), 1026 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 1ba43fae10..bfc7af54ec 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Audio Classification inference
  */
 export interface AudioClassificationInput {
-    /**
-     * The input audio data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: AudioClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: AudioClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: AudioClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: AudioClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs for Audio Classification inference
  */
 export interface AudioClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 60062756cb..d6cc4516c6 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Audio Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 7ddfe0055e..bf594e048b 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -1,33 +1,31 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
 export interface AutomaticSpeechRecognitionInput {
-    /**
-     * The input audio data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input audio data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutput {
-    /**
-     * The recognized text.
-     */
-    text: string;
-    [property: string]: unknown;
+	/**
+	 * The recognized text.
+	 */
+	text: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index 147851e579..be24719663 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Automatic Speech Recognition inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input audio data"
 		},
 		"parameters": {
@@ -20,5 +20,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts
index 9019043eb9..ca831fdb41 100644
--- a/packages/tasks/src/tasks/depth-estimation/inference.ts
+++ b/packages/tasks/src/tasks/depth-estimation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type DepthEstimationOutput = unknown[];
 
 /**
  * Inputs for Depth Estimation inference
  */
 export interface DepthEstimationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DepthEstimationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DepthEstimationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,9 +27,9 @@ export interface DepthEstimationInput {
  * Additional inference parameters for Depth Estimation
  */
 export interface DepthEstimationParameters {
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index 80f4919746..e5553f126e 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Depth Estimation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index a04b3b39b5..73eb58f1c4 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Document Question Answering inference
  */
 export interface DocumentQuestionAnsweringInput {
-    /**
-     * One (document, question) pair to answer
-     */
-    input: DocumentQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: DocumentQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (document, question) pair to answer
+	 */
+	data: DocumentQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: DocumentQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (document, question) pair to answer
  */
-export interface DocumentQuestionAnsweringInputSingle {
-    /**
-     * The image on which the question is asked
-     */
-    image: unknown;
-    /**
-     * A question to ask of the document
-     */
-    question: string;
-    [property: string]: unknown;
+export interface DocumentQuestionAnsweringInputData {
+	/**
+	 * The image on which the question is asked
+	 */
+	image: unknown;
+	/**
+	 * A question to ask of the document
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,65 +40,65 @@ export interface DocumentQuestionAnsweringInputSingle {
  * Additional inference parameters for Document Question Answering
  */
 export interface DocumentQuestionAnsweringParameters {
-    /**
-     * If the words in the document are too long to fit with the question for the model, it will
-     * be split in several chunks with some overlap. This argument controls the size of that
-     * overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * Language to use while running OCR. Defaults to english.
-     */
-    lang?: string;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using doc_stride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Can return less
-     * than top_k answers if there are not enough options available within the context.
-     */
-    topK?: number;
-    /**
-     * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
-     * skip the OCR step and use the provided bounding boxes instead.
-     */
-    wordBoxes?: Array<number[] | string>;
-    [property: string]: unknown;
+	/**
+	 * If the words in the document are too long to fit with the question for the model, it will
+	 * be split in several chunks with some overlap. This argument controls the size of that
+	 * overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * Language to use while running OCR. Defaults to english.
+	 */
+	lang?: string;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using doc_stride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Can return less
+	 * than top_k answers if there are not enough options available within the context.
+	 */
+	topK?: number;
+	/**
+	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+	 * skip the OCR step and use the provided bounding boxes instead.
+	 */
+	wordBoxes?: Array<number[] | string>;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Document Question Answering task
  */
 export interface DocumentQuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    end:    number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    start: number;
-    /**
-     * The index of each word/box pair that is in the answer
-     */
-    words: number[];
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	start: number;
+	/**
+	 * The index of each word/box pair that is in the answer
+	 */
+	words: number[];
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 580a95c92b..2161614c4d 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -4,9 +4,20 @@
 	"description": "Inputs for Document Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (document, question) pair to answer",
-			"$ref": "#/$defs/DocumentQuestionAnsweringInputSingle"
+			"type": "object",
+			"title": "DocumentQuestionAnsweringInputData",
+			"properties": {
+				"image": {
+					"description": "The image on which the question is asked"
+				},
+				"question": {
+					"type": "string",
+					"description": "A question to ask of the document"
+				}
+			},
+			"required": ["image", "question"]
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -67,21 +78,7 @@
 					}
 				}
 			}
-		},
-		"DocumentQuestionAnsweringInputSingle": {
-			"type": "object",
-			"title": "DocumentQuestionAnsweringInputSingle",
-			"properties": {
-				"image": {
-					"description": "The image on which the question is asked"
-				},
-				"question": {
-					"type": "string",
-					"description": "A question to ask of the document"
-				}
-			},
-			"required": ["image", "question"]
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index d8674f516d..664b8fa331 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -1,22 +1,20 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Embedding inference
  */
 export interface FeatureExtractionInput {
-    /**
-     * The text to get the embeddings of
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The text to get the embeddings of
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index e2eadc4e91..0170a70cda 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Embedding inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to get the embeddings of",
 			"type": "string"
 		},
@@ -21,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index e01feec2ff..c51ba8ec96 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Fill Mask inference
  */
 export interface FillMaskInput {
-    /**
-     * The text with masked tokens
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: FillMaskParameters;
-    [property: string]: unknown;
+	/**
+	 * The text with masked tokens
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: FillMaskParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,39 +25,39 @@ export interface FillMaskInput {
  * Additional inference parameters for Fill Mask
  */
 export interface FillMaskParameters {
-    /**
-     * When passed, the model will limit the scores to the passed targets instead of looking up
-     * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
-     * tokenized and the first resulting token will be used (with a warning, and that might be
-     * slower).
-     */
-    targets?: string[];
-    /**
-     * When passed, overrides the number of predictions to return.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * When passed, the model will limit the scores to the passed targets instead of looking up
+	 * in the whole vocabulary. If the provided targets are not in the model vocab, they will be
+	 * tokenized and the first resulting token will be used (with a warning, and that might be
+	 * slower).
+	 */
+	targets?: string[];
+	/**
+	 * When passed, overrides the number of predictions to return.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Fill Mask task
  */
 export interface FillMaskOutput {
-    /**
-     * The corresponding probability
-     */
-    score: number;
-    /**
-     * The corresponding input with the mask token prediction.
-     */
-    sequence: string;
-    /**
-     * The predicted token id (to replace the masked one).
-     */
-    token: number;
-    /**
-     * The predicted token (to replace the masked one).
-     */
-    tokenStr: string;
-    [property: string]: unknown;
+	/**
+	 * The corresponding probability
+	 */
+	score: number;
+	/**
+	 * The corresponding input with the mask token prediction.
+	 */
+	sequence: string;
+	/**
+	 * The predicted token id (to replace the masked one).
+	 */
+	token: number;
+	/**
+	 * The predicted token (to replace the masked one).
+	 */
+	tokenStr: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 62f935fe2d..0174dbd5b4 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Fill Mask inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text with masked tokens",
 			"type": "string"
 		},
@@ -33,5 +33,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 4885312553..de10f4731e 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Classification inference
  */
 export interface ImageClassificationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: ImageClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: ImageClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index fec2d9aa00..00c6e8b9f7 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -41,5 +41,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index ba31379ef0..366c998f33 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image Segmentation inference
  */
 export interface ImageSegmentationInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageSegmentationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageSegmentationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface ImageSegmentationInput {
  * Additional inference parameters for Image Segmentation
  */
 export interface ImageSegmentationParameters {
-    /**
-     * Threshold to use when turning the predicted masks into binary values.
-     */
-    maskThreshold?: number;
-    /**
-     * Mask overlap threshold to eliminate small, disconnected segments.
-     */
-    overlapMaskAreaThreshold?: number;
-    /**
-     * Segmentation task to be performed, depending on model capabilities.
-     */
-    subtask?: ImageSegmentationSubtask;
-    /**
-     * Probability threshold to filter out predicted masks.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * Threshold to use when turning the predicted masks into binary values.
+	 */
+	maskThreshold?: number;
+	/**
+	 * Mask overlap threshold to eliminate small, disconnected segments.
+	 */
+	overlapMaskAreaThreshold?: number;
+	/**
+	 * Segmentation task to be performed, depending on model capabilities.
+	 */
+	subtask?: ImageSegmentationSubtask;
+	/**
+	 * Probability threshold to filter out predicted masks.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
@@ -54,17 +52,17 @@ export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
  * A predicted mask / segment
  */
 export interface ImageSegmentationOutput {
-    /**
-     * The label of the predicted segment
-     */
-    label: string;
-    /**
-     * The corresponding mask as a black-and-white image
-     */
-    mask: unknown;
-    /**
-     * The score or confidence degreee the model has
-     */
-    score?: number;
-    [property: string]: unknown;
+	/**
+	 * The label of the predicted segment
+	 */
+	label: string;
+	/**
+	 * The corresponding mask as a black-and-white image
+	 */
+	mask: unknown;
+	/**
+	 * The score or confidence degree the model has
+	 */
+	score?: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index 5e2a115e33..cb0c8dd18c 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image Segmentation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index 6ee8d47ba1..f05e24b6e4 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type ImageToImageOutput = unknown[];
 
 /**
  * Inputs for Image To Image inference
  */
 export interface ImageToImageInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: unknown;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: unknown;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index 61653d3c3e..f95e74d3da 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image To Image inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -18,5 +18,5 @@
 			"description": "Additional inference parameters for Image To Image"
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index d4430b5c56..210b2d8788 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Image To Text inference
  */
 export interface ImageToTextInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ImageToTextParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ImageToTextParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,20 +25,20 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
-    /**
-     * The amount of maximum tokens to generate.
-     */
-    maxNewTokens?: number;
-    [property: string]: unknown;
+	/**
+	 * The amount of maximum tokens to generate.
+	 */
+	maxNewTokens?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Image To Text task
  */
 export interface ImageToTextOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index 0ae6331c55..a49b445fed 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Image To Text inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index 0d38adb58a..f432d2cba5 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Object Detection inference
  */
 export interface ObjectDetectionInput {
-    /**
-     * The input image data
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ObjectDetectionParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ObjectDetectionParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,31 +25,31 @@ export interface ObjectDetectionInput {
  * Additional inference parameters for Object Detection
  */
 export interface ObjectDetectionParameters {
-    /**
-     * The probability necessary to make a prediction.
-     */
-    threshold?: number;
-    [property: string]: unknown;
+	/**
+	 * The probability necessary to make a prediction.
+	 */
+	threshold?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Object Detection task
  */
 export interface ObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: BoundingBox;
-    /**
-     * The predicted label for the bounding box
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * The predicted label for the bounding box
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -59,9 +57,9 @@ export interface ObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 5055f2e17f..8593df43c3 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Object Detection inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data"
 		},
 		"parameters": {
@@ -25,5 +25,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index 8dc1c32613..eb8b9b50bd 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -4,19 +4,9 @@
 	"description": "Inputs for <TASK_ID> inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "TODO: describe the input here. This must be model & framework agnostic.",
-			"anyOf": [
-				{
-					"type": "string"
-				},
-				{
-					"type": "array",
-					"items": {
-						"type": "string"
-					}
-				}
-			]
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -40,5 +30,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 7c3dd476a2..1895b1dd49 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Question Answering inference
  */
 export interface QuestionAnsweringInput {
-    /**
-     * One (context, question) pair to answer
-     */
-    input: Input;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: QuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (context, question) pair to answer
+	 */
+	data: QuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: QuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (context, question) pair to answer
  */
-export interface Input {
-    /**
-     * The context to be used for answering the question
-     */
-    context: string;
-    /**
-     * The question to be answered
-     */
-    question: string;
-    [property: string]: unknown;
+export interface QuestionAnsweringInputData {
+	/**
+	 * The context to be used for answering the question
+	 */
+	context: string;
+	/**
+	 * The question to be answered
+	 */
+	question: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,63 +40,63 @@ export interface Input {
  * Additional inference parameters for Question Answering
  */
 export interface QuestionAnsweringParameters {
-    /**
-     * Attempts to align the answer to real words. Improves quality on space separated
-     * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
-     */
-    alignToWords?: boolean;
-    /**
-     * If the context is too long to fit with the question for the model, it will be split in
-     * several chunks with some overlap. This argument controls the size of that overlap.
-     */
-    docStride?: number;
-    /**
-     * Whether to accept impossible as an answer.
-     */
-    handleImpossibleAnswer?: boolean;
-    /**
-     * The maximum length of predicted answers (e.g., only answers with a shorter length are
-     * considered).
-     */
-    maxAnswerLen?: number;
-    /**
-     * The maximum length of the question after tokenization. It will be truncated if needed.
-     */
-    maxQuestionLen?: number;
-    /**
-     * The maximum length of the total sentence (context + question) in tokens of each chunk
-     * passed to the model. The context will be split in several chunks (using docStride as
-     * overlap) if needed.
-     */
-    maxSeqLen?: number;
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * Attempts to align the answer to real words. Improves quality on space separated
+	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
+	 */
+	alignToWords?: boolean;
+	/**
+	 * If the context is too long to fit with the question for the model, it will be split in
+	 * several chunks with some overlap. This argument controls the size of that overlap.
+	 */
+	docStride?: number;
+	/**
+	 * Whether to accept impossible as an answer.
+	 */
+	handleImpossibleAnswer?: boolean;
+	/**
+	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
+	 * considered).
+	 */
+	maxAnswerLen?: number;
+	/**
+	 * The maximum length of the question after tokenization. It will be truncated if needed.
+	 */
+	maxQuestionLen?: number;
+	/**
+	 * The maximum length of the total sentence (context + question) in tokens of each chunk
+	 * passed to the model. The context will be split in several chunks (using docStride as
+	 * overlap) if needed.
+	 */
+	maxSeqLen?: number;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return fewer than topK answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Question Answering task
  */
 export interface QuestionAnsweringOutput {
-    /**
-     * The answer to the question.
-     */
-    answer: string;
-    /**
-     * The character position in the input where the answer ends.
-     */
-    end: number;
-    /**
-     * The probability associated to the answer.
-     */
-    score: number;
-    /**
-     * The character position in the input where the answer begins.
-     */
-    start: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question.
+	 */
+	answer: string;
+	/**
+	 * The character position in the input where the answer ends.
+	 */
+	end: number;
+	/**
+	 * The probability associated to the answer.
+	 */
+	score: number;
+	/**
+	 * The character position in the input where the answer begins.
+	 */
+	start: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index da38a8f8aa..92484661bb 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -4,7 +4,8 @@
 	"description": "Inputs for Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
+			"title": "QuestionAnsweringInputData",
 			"description": "One (context, question) pair to answer",
 			"type": "object",
 			"properties": {
@@ -61,5 +62,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts
index 2b0df40114..f1b72447d5 100644
--- a/packages/tasks/src/tasks/sentence-similarity/inference.ts
+++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts
@@ -1,34 +1,32 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 export type SentenceSimilarityOutput = number[];
 
 /**
  * Inputs for Sentence similarity inference
  */
 export interface SentenceSimilarityInput {
-    input: InputObject;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	data: SentenceSimilarityInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
-export interface InputObject {
-    /**
-     * A list of strings which will be compared against the source_sentence.
-     */
-    sentences: string[];
-    /**
-     * The string that you wish to compare the other strings with. This can be a phrase,
-     * sentence, or longer passage, depending on the model being used.
-     */
-    sourceSentence: string;
-    [property: string]: unknown;
+export interface SentenceSimilarityInputData {
+	/**
+	 * A list of strings which will be compared against the sourceSentence.
+	 */
+	sentences: string[];
+	/**
+	 * The string that you wish to compare the other strings with. This can be a phrase,
+	 * sentence, or longer passage, depending on the model being used.
+	 */
+	sourceSentence: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index 2e85543f88..1141781e0d 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Sentence similarity inference",
 	"type": "object",
 	"properties": {
-		"input": {
-			"title": "SentenceSimilarityInput",
+		"data": {
+			"title": "SentenceSimilarityInputData",
 			"type": "object",
 			"properties": {
 				"sourceSentence": {
@@ -35,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/output.json b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
index e1fc1c9acb..ca13d98bd5 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/output.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/output.json
@@ -6,6 +6,7 @@
 	"type": "array",
 	"items": {
 		"description": "The associated similarity score for each of the given sentences",
-		"type": "number"
+		"type": "number",
+		"title": "SentenceSimilarityScore"
 	}
 }
diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts
index 9a063e93cc..16d30cf7a1 100644
--- a/packages/tasks/src/tasks/summarization/inference.ts
+++ b/packages/tasks/src/tasks/summarization/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Summarization inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface SummarizationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface SummarizationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 21cc519b09..836aab94df 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -1,61 +1,59 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Table Question Answering inference
  */
 export interface TableQuestionAnsweringInput {
-    /**
-     * One (table, question) pair to answer
-     */
-    input: Input;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * One (table, question) pair to answer
+	 */
+	data: TableQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * One (table, question) pair to answer
  */
-export interface Input {
-    /**
-     * The question to be answered about the table
-     */
-    question: string;
-    /**
-     * The table to serve as context for the questions
-     */
-    table: { [key: string]: unknown };
-    [property: string]: unknown;
+export interface TableQuestionAnsweringInputData {
+	/**
+	 * The question to be answered about the table
+	 */
+	question: string;
+	/**
+	 * The table to serve as context for the questions
+	 */
+	table: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Table Question Answering task
  */
 export interface TableQuestionAnsweringOutput {
-    /**
-     * If the model has an aggregator, this returns the aggregator.
-     */
-    aggregator?: string;
-    /**
-     * The answer of the question given the table. If there is an aggregator, the answer will be
-     * preceded by `AGGREGATOR >`.
-     */
-    answer: string;
-    /**
-     * List of strings made up of the answer cell values.
-     */
-    cells: string[];
-    /**
-     * Coordinates of the cells of the answers.
-     */
-    coordinates: Array<number[]>;
-    [property: string]: unknown;
+	/**
+	 * If the model has an aggregator, this returns the aggregator.
+	 */
+	aggregator?: string;
+	/**
+	 * The answer of the question given the table. If there is an aggregator, the answer will be
+	 * preceded by `AGGREGATOR >`.
+	 */
+	answer: string;
+	/**
+	 * List of strings made up of the answer cell values.
+	 */
+	cells: string[];
+	/**
+	 * Coordinates of the cells of the answers.
+	 */
+	coordinates: Array<number[]>;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index 3ceb5c07a0..ee6fcbce5f 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -4,8 +4,9 @@
 	"description": "Inputs for Table Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (table, question) pair to answer",
+			"title": "TableQuestionAnsweringInputData",
 			"type": "object",
 			"properties": {
 				"table": {
@@ -32,5 +33,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 6e09d5c7ce..5f4f466a04 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Classification inference
  */
 export interface TextClassificationInput {
-    /**
-     * The text to classify
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to classify
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,15 +25,15 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: TextClassificationOutputTransform;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: TextClassificationOutputTransform;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -44,13 +42,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 85c8468ad4..26d0bd9f1f 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to classify",
 			"type": "string"
 		},
@@ -42,5 +42,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index cd83abbb2e..13a09ff28b 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text Generation inference
  */
 export interface TextGenerationInput {
-    /**
-     * The text to initialize generation with
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The text to initialize generation with
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,63 +25,63 @@ export interface TextGenerationInput {
  * Additional inference parameters for Text Generation
  */
 export interface TextGenerationParameters {
-    /**
-     * Whether to use logit sampling (true) or greedy search (false).
-     */
-    doSample?: boolean;
-    /**
-     * Maximum number of generated tokens.
-     */
-    maxNewTokens?: number;
-    /**
-     * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
-     * paper](https://hf.co/papers/1909.05858) for more details.
-     */
-    repetitionPenalty?: number;
-    /**
-     * Whether to prepend the prompt to the generated text.
-     */
-    returnFullText?: boolean;
-    /**
-     * Stop generating tokens if a member of `stop_sequences` is generated.
-     */
-    stopSequences?: string[];
-    /**
-     * The value used to modulate the logits distribution.
-     */
-    temperature?: number;
-    /**
-     * The number of highest probability vocabulary tokens to keep for top-k-filtering.
-     */
-    topK?: number;
-    /**
-     * If set to < 1, only the smallest set of most probable tokens with probabilities that add
-     * up to `top_p` or higher are kept for generation.
-     */
-    topP?: number;
-    /**
-     * Truncate input tokens to the given size.
-     */
-    truncate?: number;
-    /**
-     * Typical Decoding mass. See [Typical Decoding for Natural Language
-     * Generation](https://hf.co/papers/2202.00666) for more information
-     */
-    typicalP?: number;
-    /**
-     * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
-     */
-    watermark?: boolean;
-    [property: string]: unknown;
+	/**
+	 * Whether to use logit sampling (true) or greedy search (false).
+	 */
+	doSample?: boolean;
+	/**
+	 * Maximum number of generated tokens.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
+	 * paper](https://hf.co/papers/1909.05858) for more details.
+	 */
+	repetitionPenalty?: number;
+	/**
+	 * Whether to prepend the prompt to the generated text.
+	 */
+	returnFullText?: boolean;
+	/**
+	 * Stop generating tokens if a member of `stopSequences` is generated.
+	 */
+	stopSequences?: string[];
+	/**
+	 * The value used to modulate the logits distribution.
+	 */
+	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
+	 * up to `topP` or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Truncate input tokens to the given size.
+	 */
+	truncate?: number;
+	/**
+	 * Typical Decoding mass. See [Typical Decoding for Natural Language
+	 * Generation](https://hf.co/papers/2202.00666) for more information
+	 */
+	typicalP?: number;
+	/**
+	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
+	 */
+	watermark?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs for Text Generation inference
  */
 export interface TextGenerationOutput {
-    /**
-     * The generated text
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index b1cf45995d..0c8bf8eaa1 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text Generation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The text to initialize generation with",
 			"type": "string"
 		},
@@ -69,5 +69,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 71e745c21d..be2a70bfd3 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -1,37 +1,35 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text To Audio inference
  */
 export interface TextToAudioInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToAudioOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index a0802c1027..5c69ef1791 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text To Audio inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -21,5 +21,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index be5c01981f..f119bc62f1 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text to Speech inference
  *
  * Inputs for Text To Audio inference
  */
 export interface TextToSpeechInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
@@ -29,13 +27,13 @@ export interface TextToSpeechInput {
  * Outputs of inference for the Text To Audio task
  */
 export interface TextToSpeechOutput {
-    /**
-     * The generated audio waveform.
-     */
-    audio: unknown;
-    /**
-     * The sampling rate of the generated audio waveform.
-     */
-    samplingRate: number;
-    [property: string]: unknown;
+	/**
+	 * The generated audio waveform.
+	 */
+	audio: unknown;
+	/**
+	 * The sampling rate of the generated audio waveform.
+	 */
+	samplingRate: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index ce66234225..81c160e27a 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Text2text Generation inference
  */
 export interface Text2TextGenerationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface Text2TextGenerationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -48,9 +46,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface Text2TextGenerationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index 495b5a2817..e54834e991 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Text2text Generation inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -50,5 +50,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 629c47c471..4584ca51de 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Token Classification inference
  */
 export interface TokenClassificationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: TokenClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TokenClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,19 +25,19 @@ export interface TokenClassificationInput {
  * Additional inference parameters for Token Classification
  */
 export interface TokenClassificationParameters {
-    /**
-     * The strategy used to fuse tokens based on model predictions
-     */
-    aggregationStrategy?: TokenClassificationAggregationStrategy;
-    /**
-     * A list of labels to ignore
-     */
-    ignoreLabels?: string[];
-    /**
-     * The number of overlapping tokens between chunks when splitting the input text.
-     */
-    stride?: number;
-    [property: string]: unknown;
+	/**
+	 * The strategy used to fuse tokens based on model predictions
+	 */
+	aggregationStrategy?: TokenClassificationAggregationStrategy;
+	/**
+	 * A list of labels to ignore
+	 */
+	ignoreLabels?: string[];
+	/**
+	 * The number of overlapping tokens between chunks when splitting the input text.
+	 */
+	stride?: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -62,26 +60,26 @@ export type TokenClassificationAggregationStrategy = "none" | "simple" | "first"
  * Outputs of inference for the Token Classification task
  */
 export interface TokenClassificationOutput {
-    /**
-     * The character position in the input where this group ends.
-     */
-    end?: number;
-    /**
-     * The predicted label for that group of tokens
-     */
-    entityGroup?: string;
-    label:        unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    /**
-     * The character position in the input where this group begins.
-     */
-    start?: number;
-    /**
-     * The corresponding text
-     */
-    word?: string;
-    [property: string]: unknown;
+	/**
+	 * The character position in the input where this group ends.
+	 */
+	end?: number;
+	/**
+	 * The predicted label for that group of tokens
+	 */
+	entityGroup?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	/**
+	 * The character position in the input where this group begins.
+	 */
+	start?: number;
+	/**
+	 * The corresponding text
+	 */
+	word?: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index f46b20cf9a..9b59fcb791 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -4,7 +4,7 @@
 	"description": "Inputs for Token Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data",
 			"type": "string"
 		},
@@ -60,5 +60,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts
index 96090808be..c932617a40 100644
--- a/packages/tasks/src/tasks/translation/inference.ts
+++ b/packages/tasks/src/tasks/translation/inference.ts
@@ -1,26 +1,24 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Translation inference
  *
  * Inputs for Text2text Generation inference
  */
 export interface TranslationInput {
-    /**
-     * The input text data
-     */
-    input: string;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: Text2TextGenerationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: Text2TextGenerationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -29,19 +27,19 @@ export interface TranslationInput {
  * Additional inference parameters for Text2text Generation
  */
 export interface Text2TextGenerationParameters {
-    /**
-     * Whether to clean up the potential extra spaces in the text output.
-     */
-    cleanUpTokenizationSpaces?: boolean;
-    /**
-     * Additional parametrization of the text generation algorithm
-     */
-    generateParameters?: { [key: string]: unknown };
-    /**
-     * The truncation strategy to use
-     */
-    truncation?: Text2TextGenerationTruncationStrategy;
-    [property: string]: unknown;
+	/**
+	 * Whether to clean up the potential extra spaces in the text output.
+	 */
+	cleanUpTokenizationSpaces?: boolean;
+	/**
+	 * Additional parametrization of the text generation algorithm
+	 */
+	generateParameters?: { [key: string]: unknown };
+	/**
+	 * The truncation strategy to use
+	 */
+	truncation?: Text2TextGenerationTruncationStrategy;
+	[property: string]: unknown;
 }
 
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
@@ -52,9 +50,9 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface TranslationOutput {
-    /**
-     * The generated text.
-     */
-    generatedText: string;
-    [property: string]: unknown;
+	/**
+	 * The generated text.
+	 */
+	generatedText: string;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 2d258b33be..1914bfda65 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -1,24 +1,22 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Video Classification inference
  */
 export interface VideoClassificationInput {
-    /**
-     * One or several videos to be classified
-     */
-    input: unknown;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VideoClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input video data
+	 */
+	data: unknown;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VideoClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
@@ -27,23 +25,23 @@ export interface VideoClassificationInput {
  * Additional inference parameters for Video Classification
  */
 export interface VideoClassificationParameters {
-    /**
-     * The sampling rate used to select frames from the video.
-     */
-    frameSamplingRate?: number;
-    /**
-     * The function to apply to the model outputs in order to retrieve the scores.
-     */
-    functionToApply?: TextClassificationOutputTransform;
-    /**
-     * The number of sampled frames to consider for classification.
-     */
-    numFrames?: number;
-    /**
-     * When specified, limits the output to the top K most probable classes.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The sampling rate used to select frames from the video.
+	 */
+	frameSamplingRate?: number;
+	/**
+	 * The function to apply to the model outputs in order to retrieve the scores.
+	 */
+	functionToApply?: TextClassificationOutputTransform;
+	/**
+	 * The number of sampled frames to consider for classification.
+	 */
+	numFrames?: number;
+	/**
+	 * When specified, limits the output to the top K most probable classes.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
@@ -52,13 +50,13 @@ export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutput {
-    /**
-     * The predicted class label (model specific).
-     */
-    label: string;
-    /**
-     * The corresponding probability.
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted class label (model specific).
+	 */
+	label: string;
+	/**
+	 * The corresponding probability.
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 796ce393fa..c05a8b1113 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -4,8 +4,8 @@
 	"description": "Inputs for Video Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
-			"description": "One or several videos to be classified"
+		"data": {
+			"description": "The input video data"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -49,5 +49,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 7d192a33d0..0b0ee2e5a8 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Visual Question Answering inference
  */
 export interface VisualQuestionAnsweringInput {
-    /**
-     * One (image, question) pair to answer
-     */
-    input: VisualQuestionAnsweringInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: VisualQuestionAnsweringParameters;
-    [property: string]: unknown;
+	/**
+	 * One (image, question) pair to answer
+	 */
+	data: VisualQuestionAnsweringInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: VisualQuestionAnsweringParameters;
+	[property: string]: unknown;
 }
 
 /**
  * One (image, question) pair to answer
  */
-export interface VisualQuestionAnsweringInputSingle {
-    /**
-     * The image.
-     */
-    image: unknown;
-    /**
-     * The question to answer based on the image.
-     */
-    question: unknown;
-    [property: string]: unknown;
+export interface VisualQuestionAnsweringInputData {
+	/**
+	 * The image.
+	 */
+	image: unknown;
+	/**
+	 * The question to answer based on the image.
+	 */
+	question: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,27 +40,27 @@ export interface VisualQuestionAnsweringInputSingle {
  * Additional inference parameters for Visual Question Answering
  */
 export interface VisualQuestionAnsweringParameters {
-    /**
-     * The number of answers to return (will be chosen by order of likelihood). Note that we
-     * return less than topk answers if there are not enough options available within the
-     * context.
-     */
-    topK?: number;
-    [property: string]: unknown;
+	/**
+	 * The number of answers to return (will be chosen by order of likelihood). Note that we
+	 * return fewer than topK answers if there are not enough options available within the
+	 * context.
+	 */
+	topK?: number;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Visual Question Answering task
  */
 export interface VisualQuestionAnsweringOutput {
-    /**
-     * The answer to the question
-     */
-    answer?: string;
-    label:   unknown;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The answer to the question
+	 */
+	answer?: string;
+	label: unknown;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 2e77422d9d..3a54c69fab 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Visual Question Answering inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "One (image, question) pair to answer",
 			"type": "object",
-			"title": "VisualQuestionAnsweringInputSingle",
+			"title": "VisualQuestionAnsweringInputData",
 			"properties": {
 				"image": {
 					"description": "The image."
@@ -36,5 +36,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 578f24946d..369474a6da 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Classification inference
  */
 export interface ZeroShotClassificationInput {
-    /**
-     * The input text data, with candidate labels
-     */
-    input: InputObject;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input text data, with candidate labels
+	 */
+	data: ZeroShotClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
  * The input text data, with candidate labels
  */
-export interface InputObject {
-    /**
-     * The set of possible class labels to classify the text into.
-     */
-    candidateLabels: string[];
-    /**
-     * The text to classify
-     */
-    text: string;
-    [property: string]: unknown;
+export interface ZeroShotClassificationInputData {
+	/**
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidateLabels: string[];
+	/**
+	 * The text to classify
+	 */
+	text: string;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,31 +40,31 @@ export interface InputObject {
  * Additional inference parameters for Zero Shot Classification
  */
 export interface ZeroShotClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    /**
-     * Whether multiple candidate labels can be true. If false, the scores are normalized such
-     * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
-     * considered independent and probabilities are normalized for each candidate.
-     */
-    multiLabel?: boolean;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	/**
+	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
+	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+	 * considered independent and probabilities are normalized for each candidate.
+	 */
+	multiLabel?: boolean;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
 export interface ZeroShotClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index ce10d0b616..d4d0ba00bd 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input text data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotClassificationInput",
+			"title": "ZeroShotClassificationInputData",
 			"properties": {
 				"text": {
 					"type": "string",
@@ -45,5 +45,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 970d7708b9..65649ff5a6 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -1,39 +1,37 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
 export interface ZeroShotImageClassificationInput {
-    /**
-     * The input image data, with candidate labels
-     */
-    input: ZeroShotImageClassificationInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: ZeroShotImageClassificationParameters;
-    [property: string]: unknown;
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotImageClassificationInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: ZeroShotImageClassificationParameters;
+	[property: string]: unknown;
 }
 
 /**
  * The input image data, with candidate labels
  */
-export interface ZeroShotImageClassificationInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to classify
-     */
-    image: unknown;
-    [property: string]: unknown;
+export interface ZeroShotImageClassificationInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to classify
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
@@ -42,25 +40,25 @@ export interface ZeroShotImageClassificationInputSingle {
  * Additional inference parameters for Zero Shot Image Classification
  */
 export interface ZeroShotImageClassificationParameters {
-    /**
-     * The sentence used in conjunction with candidateLabels to attempt the text classification
-     * by replacing the placeholder with the candidate labels.
-     */
-    hypothesisTemplate?: string;
-    [property: string]: unknown;
+	/**
+	 * The sentence used in conjunction with candidateLabels to attempt the text classification
+	 * by replacing the placeholder with the candidate labels.
+	 */
+	hypothesisTemplate?: string;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
 export interface ZeroShotImageClassificationOutput {
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 07fffa5efe..44102978e3 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Image Classification inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotImageClassificationInputSingle",
+			"title": "ZeroShotImageClassificationInputData",
 			"properties": {
 				"image": {
 					"description": "The image data to classify"
@@ -40,5 +40,5 @@
 			}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 2e3e12f743..987662e24d 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,59 +1,57 @@
-
 /**
  * Inference code generated from the JSON schema spec in ./spec
- * 
+ *
  * Using src/scripts/inference-codegen
  */
 
-
 /**
  * Inputs for Zero Shot Object Detection inference
  */
 export interface ZeroShotObjectDetectionInput {
-    /**
-     * The input image data, with candidate labels
-     */
-    input: ZeroShotObjectDetectionInputSingle;
-    /**
-     * Additional inference parameters
-     */
-    parameters?: { [key: string]: unknown };
-    [property: string]: unknown;
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotObjectDetectionInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: { [key: string]: unknown };
+	[property: string]: unknown;
 }
 
 /**
  * The input image data, with candidate labels
  */
-export interface ZeroShotObjectDetectionInputSingle {
-    /**
-     * The candidate labels for this image
-     */
-    candidateLabels: string[];
-    /**
-     * The image data to generate bounding boxes from
-     */
-    image: unknown;
-    [property: string]: unknown;
+export interface ZeroShotObjectDetectionInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
 }
 
 /**
  * Outputs of inference for the Zero Shot Object Detection task
  */
 export interface ZeroShotObjectDetectionOutput {
-    /**
-     * The predicted bounding box. Coordinates are relative to the top left corner of the input
-     * image.
-     */
-    box: BoundingBox;
-    /**
-     * A candidate label
-     */
-    label: string;
-    /**
-     * The associated score / probability
-     */
-    score: number;
-    [property: string]: unknown;
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
 }
 
 /**
@@ -61,9 +59,9 @@ export interface ZeroShotObjectDetectionOutput {
  * image.
  */
 export interface BoundingBox {
-    xmax: number;
-    xmin: number;
-    ymax: number;
-    ymin: number;
-    [property: string]: unknown;
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 7f72f3f9bb..417dc0a78a 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -4,10 +4,10 @@
 	"description": "Inputs for Zero Shot Object Detection inference",
 	"type": "object",
 	"properties": {
-		"input": {
+		"data": {
 			"description": "The input image data, with candidate labels",
 			"type": "object",
-			"title": "ZeroShotObjectDetectionInputSingle",
+			"title": "ZeroShotObjectDetectionInputData",
 			"properties": {
 				"image": {
 					"description": "The image data to generate bounding boxes from"
@@ -35,5 +35,5 @@
 			"properties": {}
 		}
 	},
-	"required": ["input"]
+	"required": ["data"]
 }

From 3a98f588031af625b798bd5a817052014d20b3e9 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:44:19 +0100
Subject: [PATCH 32/51] enable explicit-unions when generating

---
 packages/tasks/src/scripts/inference-codegen.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index aa92ba5a43..fedb95a64b 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -62,7 +62,7 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 			"prefer-unions": true,
 			"prefer-const-values": true,
 			"prefer-unknown": true,
-			// "explicit-unions": true,
+			"explicit-unions": true,
 		},
 	});
 }

From e0a493957ddc587b814144694cdc40e147186a94 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Thu, 25 Jan 2024 11:46:59 +0100
Subject: [PATCH 33/51] tweaks

---
 .../document-question-answering/inference.ts  |  4 +-
 .../src/tasks/feature-extraction/inference.ts |  2 +
 .../tasks/feature-extraction/spec/input.json  |  2 +-
 .../tasks/feature-extraction/spec/output.json | 48 +------------------
 4 files changed, 8 insertions(+), 48 deletions(-)

diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 73eb58f1c4..4502a8ffb0 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -78,10 +78,12 @@ export interface DocumentQuestionAnsweringParameters {
 	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
 	 * skip the OCR step and use the provided bounding boxes instead.
 	 */
-	wordBoxes?: Array<number[] | string>;
+	wordBoxes?: WordBox[];
 	[property: string]: unknown;
 }
 
+export type WordBox = number[] | string;
+
 /**
  * Outputs of inference for the Document Question Answering task
  */
diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts
index 664b8fa331..22dc8dd1d9 100644
--- a/packages/tasks/src/tasks/feature-extraction/inference.ts
+++ b/packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -4,6 +4,8 @@
  * Using src/scripts/inference-codegen
  */
 
+export type FeatureExtractionOutput = unknown[];
+
 /**
  * Inputs for Text Embedding inference
  */
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 0170a70cda..8bea845e6e 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,5 +1,5 @@
 {
-	"$id": "/inference/schemas/text-embedding/input.json",
+	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Embedding inference",
 	"type": "object",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json
index 54a29d10e2..b51788daaf 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/output.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -1,51 +1,7 @@
 {
 	"$id": "/inference/schemas/feature-extraction/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Outputs of inference for the Feature Extraction task",
+	"description": "The embedding for the input text, as a nested list (tensor) of floats",
 	"type": "array",
-	"title": "FeatureExtractionOutput",
-	"items": {
-		"description": "The features computed by the mode, as a nested list of floats",
-		"anyOf": [
-			{
-				"type": "number"
-			},
-			{
-				"type": "array",
-				"items": {
-					"anyOf": [
-						{
-							"type": "number"
-						},
-						{
-							"type": "array",
-							"items": {
-								"anyOf": [
-									{
-										"type": "number"
-									},
-									{
-										"type": "array",
-										"items": {
-											"anyOf": [
-												{
-													"type": "number"
-												},
-												{
-													"type": "array",
-													"items": {
-														"type": "number"
-													}
-												}
-											]
-										}
-									}
-								]
-							}
-						}
-					]
-				}
-			}
-		]
-	}
+	"title": "FeatureExtractionOutput"
 }

From 2d463999beb8e4b8054ebf7a87e8bf1459cbd37e Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 10:40:01 +0100
Subject: [PATCH 34/51] =?UTF-8?q?=F0=9F=A9=B9=20Don't=20use=20require=20in?=
 =?UTF-8?q?=20rootDirFinder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index fedb95a64b..1988c4c0da 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -19,11 +19,8 @@ const rootDirFinder = function (): string {
 	while (level > 0) {
 		const currentPath = parts.slice(0, level).join("/");
 		console.debug(currentPath);
-		try {
-			require(`${currentPath}/package.json`);
+		if (pathExists(`${currentPath}/package.json`)) {
 			return path.normalize(currentPath);
-		} catch (err) {
-			/// noop
 		}
 		level--;
 	}

From c1151c0caf608edb01c90ff9726b728909cbb2f8 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 11:04:46 +0100
Subject: [PATCH 35/51] Explicit titles

---
 packages/tasks/src/tasks/audio-classification/spec/input.json    | 1 +
 .../tasks/src/tasks/automatic-speech-recognition/spec/input.json | 1 +
 .../src/tasks/automatic-speech-recognition/spec/output.json      | 1 +
 packages/tasks/src/tasks/depth-estimation/spec/input.json        | 1 +
 packages/tasks/src/tasks/depth-estimation/spec/output.json       | 1 +
 .../tasks/src/tasks/document-question-answering/spec/input.json  | 1 +
 .../tasks/src/tasks/document-question-answering/spec/output.json | 1 +
 packages/tasks/src/tasks/feature-extraction/spec/input.json      | 1 +
 packages/tasks/src/tasks/fill-mask/spec/input.json               | 1 +
 packages/tasks/src/tasks/fill-mask/spec/output.json              | 1 +
 packages/tasks/src/tasks/image-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/image-classification/spec/output.json   | 1 +
 packages/tasks/src/tasks/image-segmentation/spec/input.json      | 1 +
 packages/tasks/src/tasks/image-segmentation/spec/output.json     | 1 +
 packages/tasks/src/tasks/image-to-image/spec/input.json          | 1 +
 packages/tasks/src/tasks/image-to-image/spec/output.json         | 1 +
 packages/tasks/src/tasks/image-to-text/spec/input.json           | 1 +
 packages/tasks/src/tasks/image-to-text/spec/output.json          | 1 +
 packages/tasks/src/tasks/object-detection/spec/input.json        | 1 +
 packages/tasks/src/tasks/object-detection/spec/output.json       | 1 +
 packages/tasks/src/tasks/placeholder/spec/input.json             | 1 +
 packages/tasks/src/tasks/placeholder/spec/output.json            | 1 +
 packages/tasks/src/tasks/question-answering/spec/input.json      | 1 +
 packages/tasks/src/tasks/sentence-similarity/spec/input.json     | 1 +
 packages/tasks/src/tasks/summarization/spec/input.json           | 1 +
 packages/tasks/src/tasks/summarization/spec/output.json          | 1 +
 .../tasks/src/tasks/table-question-answering/spec/input.json     | 1 +
 .../tasks/src/tasks/table-question-answering/spec/output.json    | 1 +
 packages/tasks/src/tasks/text-classification/spec/input.json     | 1 +
 packages/tasks/src/tasks/text-classification/spec/output.json    | 1 +
 packages/tasks/src/tasks/text-generation/spec/input.json         | 1 +
 packages/tasks/src/tasks/text-generation/spec/output.json        | 1 +
 packages/tasks/src/tasks/text-to-audio/spec/input.json           | 1 +
 packages/tasks/src/tasks/text-to-audio/spec/output.json          | 1 +
 packages/tasks/src/tasks/text-to-speech/spec/input.json          | 1 +
 packages/tasks/src/tasks/text-to-speech/spec/output.json         | 1 +
 packages/tasks/src/tasks/text2text-generation/spec/input.json    | 1 +
 packages/tasks/src/tasks/text2text-generation/spec/output.json   | 1 +
 packages/tasks/src/tasks/token-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/token-classification/spec/output.json   | 1 +
 packages/tasks/src/tasks/translation/spec/input.json             | 1 +
 packages/tasks/src/tasks/translation/spec/output.json            | 1 +
 packages/tasks/src/tasks/video-classification/spec/input.json    | 1 +
 packages/tasks/src/tasks/video-classification/spec/output.json   | 1 +
 .../tasks/src/tasks/visual-question-answering/spec/input.json    | 1 +
 .../tasks/src/tasks/visual-question-answering/spec/output.json   | 1 +
 .../tasks/src/tasks/zero-shot-classification/spec/input.json     | 1 +
 .../tasks/src/tasks/zero-shot-classification/spec/output.json    | 1 +
 .../src/tasks/zero-shot-image-classification/spec/input.json     | 1 +
 .../src/tasks/zero-shot-image-classification/spec/output.json    | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/spec/input.json   | 1 +
 .../tasks/src/tasks/zero-shot-object-detection/spec/output.json  | 1 +
 52 files changed, 52 insertions(+)

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index d6cc4516c6..80e8651fe7 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/audio-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Audio Classification inference",
+	"title": "AudioClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index be24719663..f44075d56c 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/automatic-speech-recognition/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Automatic Speech Recognition inference",
+	"title": "AutomaticSpeechRecognitionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index a8b8af7822..72573986d7 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/automatic-speech-recognition/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
+	"title": "AutomaticSpeechRecognitionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json
index e5553f126e..3d58c82ff6 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/input.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/depth-estimation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Depth Estimation inference",
+	"title": "DepthEstimationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json
index c3ebebcc5d..72d6a714dd 100644
--- a/packages/tasks/src/tasks/depth-estimation/spec/output.json
+++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/depth-estimation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Depth Estimation task",
+	"title": "DepthEstimationOutput",
 	"type": "array",
 	"items": {
 		"description": "The output depth labels"
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json
index 2161614c4d..a607735e74 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/document-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Document Question Answering inference",
+	"title": "DocumentQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json
index 4c77527757..9f69584ae8 100644
--- a/packages/tasks/src/tasks/document-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/document-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Document Question Answering task",
+	"title": "DocumentQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json
index 8bea845e6e..a61455f6ca 100644
--- a/packages/tasks/src/tasks/feature-extraction/spec/input.json
+++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/feature-extraction/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Embedding inference",
+	"title": "FeatureExtractionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json
index 0174dbd5b4..00def602ef 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/input.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/fill-mask/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Fill Mask inference",
+	"title": "FillMaskInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json
index 3453d65d42..f8e91aeeaa 100644
--- a/packages/tasks/src/tasks/fill-mask/spec/output.json
+++ b/packages/tasks/src/tasks/fill-mask/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/fill-mask/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Fill Mask task",
+	"title": "FillMaskOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 00c6e8b9f7..1dee66b97b 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Classification inference",
+	"title": "ImageClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index da8a2a5c7a..a875898b64 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Classification task",
+	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
index cb0c8dd18c..ae4adc70e9 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/input.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-segmentation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image Segmentation inference",
+	"title": "ImageSegmentationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
index 4b7cb643c8..b20aa415e0 100644
--- a/packages/tasks/src/tasks/image-segmentation/spec/output.json
+++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-segmentation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image Segmentation task",
+	"title": "ImageSegmentationOutput",
 	"type": "array",
 	"items": {
 		"description": "A predicted mask / segment",
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index f95e74d3da..d91d6e6d4a 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-image/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Image inference",
+	"title": "ImageToImageInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index d9c4f9bf21..5e55f5677a 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-image/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
+	"title": "ImageToImageOutput",
 	"type": "array",
 	"items": {
 		"description": "The output image"
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index a49b445fed..f06eb59f00 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-text/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Image To Text inference",
+	"title": "ImageToTextInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json
index 81960cd222..e3283e34f7 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/image-to-text/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Text task",
+	"title": "ImageToTextOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json
index 8593df43c3..e01ebf4965 100644
--- a/packages/tasks/src/tasks/object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/object-detection/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Object Detection inference",
+	"title": "ObjectDetectionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json
index 450d96ed28..20c92d5d30 100644
--- a/packages/tasks/src/tasks/object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/object-detection/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Object Detection task",
+	"title": "ObjectDetectionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json
index eb8b9b50bd..5c206baef3 100644
--- a/packages/tasks/src/tasks/placeholder/spec/input.json
+++ b/packages/tasks/src/tasks/placeholder/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/<TASK_ID>/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for <TASK_ID> inference",
+	"title": "PlaceholderInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/placeholder/spec/output.json b/packages/tasks/src/tasks/placeholder/spec/output.json
index b4b4225f63..8e3e132941 100644
--- a/packages/tasks/src/tasks/placeholder/spec/output.json
+++ b/packages/tasks/src/tasks/placeholder/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/<TASK_ID>/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for <TASK_ID> inference",
+	"title": "PlaceholderOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json
index 92484661bb..088e77200b 100644
--- a/packages/tasks/src/tasks/question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Question Answering inference",
+	"title": "QuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
index 1141781e0d..8bb9e2e5ad 100644
--- a/packages/tasks/src/tasks/sentence-similarity/spec/input.json
+++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/sentence-similarity/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Sentence similarity inference",
+	"title": "SentenceSimilarityInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/summarization/spec/input.json b/packages/tasks/src/tasks/summarization/spec/input.json
index b7c09d1db8..629da31ea6 100644
--- a/packages/tasks/src/tasks/summarization/spec/input.json
+++ b/packages/tasks/src/tasks/summarization/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/input.json",
 	"$id": "/inference/schemas/summarization/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SummarizationInput",
 	"description": "Inputs for Summarization inference"
 }
diff --git a/packages/tasks/src/tasks/summarization/spec/output.json b/packages/tasks/src/tasks/summarization/spec/output.json
index df7331ee64..9b1f8bf303 100644
--- a/packages/tasks/src/tasks/summarization/spec/output.json
+++ b/packages/tasks/src/tasks/summarization/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/output.json",
 	"$id": "/inference/schemas/summarization/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "SummarizationOutput",
 	"description": "Outputs for Summarization inference"
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index ee6fcbce5f..e3fc6db9f4 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/table-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Table Question Answering inference",
+	"title": "TableQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json
index 8649006478..9b43026ea1 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/table-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Table Question Answering task",
+	"title": "TableQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 26d0bd9f1f..08bac5953c 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Classification inference",
+	"title": "TextClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 4e6d69ed99..b2b81acde6 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text Classification task",
+	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json
index 0c8bf8eaa1..2235616913 100644
--- a/packages/tasks/src/tasks/text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text-generation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text Generation inference",
+	"title": "TextGenerationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json
index 4f1eb95e55..eacb907e2c 100644
--- a/packages/tasks/src/tasks/text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text-generation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs for Text Generation inference",
+	"title": "TextGenerationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 5c69ef1791..1760609624 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-to-audio/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text To Audio inference",
+	"title": "TextToAudioInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json
index f91a9563ef..b0a25bd9ad 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text-to-audio/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text To Audio task",
+	"title": "TextToAudioOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json
index dffbf7910e..7d2bac0924 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text-to-audio/input.json",
 	"$id": "/inference/schemas/text-to-speech/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TextToSpeechInput",
 	"description": "Inputs for Text to Speech inference"
 }
diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json
index 4678592e8a..91654e2b50 100644
--- a/packages/tasks/src/tasks/text-to-speech/spec/output.json
+++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text-to-audio/output.json",
 	"$id": "/inference/schemas/text-to-speech/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TextToSpeechOutput",
 	"description": "Outputs for Text to Speech inference"
 }
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json
index e54834e991..a00ae575fc 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/input.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text2text-generation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Text2text Generation inference",
+	"title": "Text2TextGenerationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json
index 190aa6014c..f60ba8933e 100644
--- a/packages/tasks/src/tasks/text2text-generation/spec/output.json
+++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/text2text-generation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Text2text Generation task",
+	"title": "Text2TextGenerationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json
index 9b59fcb791..2fd89ce34c 100644
--- a/packages/tasks/src/tasks/token-classification/spec/input.json
+++ b/packages/tasks/src/tasks/token-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/token-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Token Classification inference",
+	"title": "TokenClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json
index 7685b740b7..8522d972a2 100644
--- a/packages/tasks/src/tasks/token-classification/spec/output.json
+++ b/packages/tasks/src/tasks/token-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/token-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Token Classification task",
+	"title": "TokenClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/translation/spec/input.json b/packages/tasks/src/tasks/translation/spec/input.json
index e3aac752cb..0695bc6728 100644
--- a/packages/tasks/src/tasks/translation/spec/input.json
+++ b/packages/tasks/src/tasks/translation/spec/input.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/input.json",
 	"$id": "/inference/schemas/translation/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TranslationInput",
 	"description": "Inputs for Translation inference"
 }
diff --git a/packages/tasks/src/tasks/translation/spec/output.json b/packages/tasks/src/tasks/translation/spec/output.json
index 6dcb98077b..61b701db2c 100644
--- a/packages/tasks/src/tasks/translation/spec/output.json
+++ b/packages/tasks/src/tasks/translation/spec/output.json
@@ -2,5 +2,6 @@
 	"$ref": "/inference/schemas/text2text-generation/output.json",
 	"$id": "/inference/schemas/translation/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
+	"title": "TranslationOutput",
 	"description": "Outputs for Translation inference"
 }
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index c05a8b1113..386992c9ad 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/video-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Video Classification inference",
+	"title": "VideoClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 7121e472fb..9220cdbaec 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/video-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Video Classification task",
+	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
index 3a54c69fab..b6cb0e123c 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/visual-question-answering/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Visual Question Answering inference",
+	"title": "VisualQuestionAnsweringInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
index 2005d9f2f3..32c9c6c26b 100644
--- a/packages/tasks/src/tasks/visual-question-answering/spec/output.json
+++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/visual-question-answering/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Visual Question Answering task",
+	"title": "VisualQuestionAnsweringOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
index d4d0ba00bd..689c22769c 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Classification inference",
+	"title": "ZeroShotClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 54f226d9d8..27ad4b00e2 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Classification task",
+	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
index 44102978e3..d5b212918f 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-image-classification/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Image Classification inference",
+	"title": "ZeroShotImageClassificationInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 102944ebcc..2b0e78b84c 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-image-classification/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Image Classification task",
+	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
 		"type": "object",
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
index 417dc0a78a..63dce00edb 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-object-detection/input.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Inputs for Zero Shot Object Detection inference",
+	"title": "ZeroShotObjectDetectionInput",
 	"type": "object",
 	"properties": {
 		"data": {
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 171e81120f..6293efc978 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -2,6 +2,7 @@
 	"$id": "/inference/schemas/zero-shot-object-detection/output.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Zero Shot Object Detection task",
+	"title": "ZeroShotObjectDetectionOutput",
 	"type": "array",
 	"items": {
 		"type": "object",

From 077a88f65be4136c101e3fb42ff380cb9feb4d20 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:37:34 +0100
Subject: [PATCH 36/51] Post-process hack to generate array types

---
 .../tasks/src/scripts/inference-codegen.ts    | 72 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 1988c4c0da..bf4d959ae9 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -3,6 +3,7 @@ import { quicktype, InputData, JSONSchemaInput, FetchingJSONSchemaStore } from "
 import * as fs from "fs/promises";
 import { existsSync as pathExists } from "fs";
 import * as path from "path";
+import * as ts from "typescript";
 
 const TYPESCRIPT_HEADER_FILE = `
 /**
@@ -18,7 +19,6 @@ const rootDirFinder = function (): string {
 	let level = parts.length - 1;
 	while (level > 0) {
 		const currentPath = parts.slice(0, level).join("/");
-		console.debug(currentPath);
 		if (pathExists(`${currentPath}/package.json`)) {
 			return path.normalize(currentPath);
 		}
@@ -64,6 +64,71 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 	});
 }
 
+/**
+ * Post-processes a generated inference file whose output spec is a top-level array: the interface named after
+ * the spec `title` is renamed to `<title>Element` and `export type <title> = <title>Element[]` is inserted.
+ * Nothing is written when the spec is not a titled array, the alias already exists, or the interface is missing.
+ * @param path2generated Path to the generated TypeScript file; it is read, then overwritten in place
+ * @param outputSpec Parsed output JSON schema of the task (only `type` and `title` are read)
+ */
+async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
+	const source = ts.createSourceFile(
+		path.basename(path2generated),
+		await fs.readFile(path2generated, { encoding: "utf-8" }),
+		ts.ScriptTarget.ES2022
+	);
+	const exportedName = outputSpec.title;
+	if (outputSpec.type !== "array" || typeof exportedName !== "string") {
+		console.log("      Nothing to do");
+		return;
+	}
+	const topLevelNodes = source.getChildAt(0).getChildren();
+	const hasTypeAlias = topLevelNodes.some(
+		(node) =>
+			node.kind === ts.SyntaxKind.TypeAliasDeclaration &&
+			(node as ts.TypeAliasDeclaration).name.escapedText === exportedName
+	);
+	if (hasTypeAlias) {
+		return;
+	}
+
+	const interfaceDeclaration = topLevelNodes.find(
+		(node): node is ts.InterfaceDeclaration =>
+			node.kind === ts.SyntaxKind.InterfaceDeclaration &&
+			(node as ts.InterfaceDeclaration).name.getText(source) === exportedName
+	);
+	if (!interfaceDeclaration) {
+		console.log("      Nothing to do");
+		return;
+	}
+
+	console.log("      Inserting top-level array type alias...");
+	const updatedInterface = ts.factory.updateInterfaceDeclaration(
+		interfaceDeclaration,
+		interfaceDeclaration.modifiers,
+		ts.factory.createIdentifier(interfaceDeclaration.name.getText(source) + "Element"),
+		interfaceDeclaration.typeParameters,
+		interfaceDeclaration.heritageClauses,
+		interfaceDeclaration.members
+	);
+	const arrayDeclaration = ts.factory.createTypeAliasDeclaration(
+		[ts.factory.createModifier(ts.SyntaxKind.ExportKeyword)],
+		exportedName,
+		undefined,
+		ts.factory.createArrayTypeNode(ts.factory.createTypeReferenceNode(updatedInterface.name))
+	);
+	const newNodes = ts.factory.createNodeArray([
+		...topLevelNodes.filter((node) => node !== interfaceDeclaration),
+		arrayDeclaration,
+		updatedInterface,
+	]);
+	// Await the write: a failure must reject this promise, and the caller must not continue before it completes
+	await fs.writeFile(path2generated, ts.createPrinter().printList(ts.ListFormat.MultiLine, newNodes, source), {
+		flag: "w+",
+		encoding: "utf-8",
+	});
+}
+
 async function main() {
 	const rootDir = rootDirFinder();
 	const tasksDir = path.join(rootDir, "src", "tasks");
@@ -96,6 +161,11 @@ async function main() {
 				encoding: "utf-8",
 			});
 		}
+
+		const outputSpec = JSON.parse(await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }));
+
+		console.log("   🩹 Post-processing the generated code");
+		await postProcessOutput(`${dirPath}/inference.ts`, outputSpec);
 	}
 	console.debug("✅ All done!");
 }

From 6b10c4d949112528a23bda474d11523d1095ed16 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:38:12 +0100
Subject: [PATCH 37/51] regenerate code

---
 .../tasks/audio-classification/inference.ts   |  7 +-
 .../automatic-speech-recognition/inference.ts |  9 +--
 .../document-question-answering/inference.ts  |  8 +--
 .../tasks/src/tasks/fill-mask/inference.ts    |  6 +-
 .../tasks/image-classification/inference.ts   |  7 +-
 .../src/tasks/image-segmentation/inference.ts |  7 +-
 .../src/tasks/image-to-text/inference.ts      |  6 +-
 .../src/tasks/object-detection/inference.ts   | 29 ++++----
 .../src/tasks/question-answering/inference.ts |  7 +-
 .../table-question-answering/inference.ts     | 14 ++--
 .../tasks/text-classification/inference.ts    |  7 +-
 .../src/tasks/text-generation/inference.ts    |  6 +-
 .../src/tasks/text-to-audio/inference.ts      |  9 +--
 .../tasks/text2text-generation/inference.ts   | 11 ++-
 .../tasks/token-classification/inference.ts   |  7 +-
 .../tasks/video-classification/inference.ts   |  7 +-
 .../visual-question-answering/inference.ts    |  7 +-
 .../zero-shot-classification/inference.ts     |  7 +-
 .../inference.ts                              |  7 +-
 .../zero-shot-object-detection/inference.ts   | 67 -------------------
 20 files changed, 64 insertions(+), 171 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index bfc7af54ec..6671cdf14e 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Audio Classification inference
  */
@@ -18,7 +17,6 @@ export interface AudioClassificationInput {
 	parameters?: AudioClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface AudioClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type AudioClassificationOutput = AudioClassificationOutputElement[];
 /**
  * Outputs for Audio Classification inference
  */
-export interface AudioClassificationOutput {
+export interface AudioClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index bf594e048b..d83c45af5d 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Automatic Speech Recognition inference
  */
@@ -15,14 +14,16 @@ export interface AutomaticSpeechRecognitionInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
-export interface AutomaticSpeechRecognitionOutput {
+export interface AutomaticSpeechRecognitionOutputElement {
 	/**
 	 * The recognized text.
 	 */
diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts
index 4502a8ffb0..cd2ab54051 100644
--- a/packages/tasks/src/tasks/document-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/document-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Document Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface DocumentQuestionAnsweringInput {
 	parameters?: DocumentQuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (document, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface DocumentQuestionAnsweringInputData {
 	question: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -81,13 +78,12 @@ export interface DocumentQuestionAnsweringParameters {
 	wordBoxes?: WordBox[];
 	[property: string]: unknown;
 }
-
 export type WordBox = number[] | string;
-
+export type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Document Question Answering task
  */
-export interface DocumentQuestionAnsweringOutput {
+export interface DocumentQuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question.
 	 */
diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts
index c51ba8ec96..b80383da64 100644
--- a/packages/tasks/src/tasks/fill-mask/inference.ts
+++ b/packages/tasks/src/tasks/fill-mask/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Fill Mask inference
  */
@@ -18,7 +17,6 @@ export interface FillMaskInput {
 	parameters?: FillMaskParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -38,11 +36,11 @@ export interface FillMaskParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type FillMaskOutput = FillMaskOutputElement[];
 /**
  * Outputs of inference for the Fill Mask task
  */
-export interface FillMaskOutput {
+export interface FillMaskOutputElement {
 	/**
 	 * The corresponding probability
 	 */
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index de10f4731e..5a43acdf5d 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image Classification inference
  */
@@ -18,7 +17,6 @@ export interface ImageClassificationInput {
 	parameters?: ImageClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface ImageClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type ImageClassificationOutput = ImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Image Classification task
  */
-export interface ImageClassificationOutput {
+export interface ImageClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
index 366c998f33..b316715f54 100644
--- a/packages/tasks/src/tasks/image-segmentation/inference.ts
+++ b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image Segmentation inference
  */
@@ -18,7 +17,6 @@ export interface ImageSegmentationInput {
 	parameters?: ImageSegmentationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -43,15 +41,14 @@ export interface ImageSegmentationParameters {
 	threshold?: number;
 	[property: string]: unknown;
 }
-
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
-
+export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
 /**
  * Outputs of inference for the Image Segmentation task
  *
  * A predicted mask / segment
  */
-export interface ImageSegmentationOutput {
+export interface ImageSegmentationOutputElement {
 	/**
 	 * The label of the predicted segment
 	 */
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index 210b2d8788..cba7451392 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Image To Text inference
  */
@@ -18,7 +17,6 @@ export interface ImageToTextInput {
 	parameters?: ImageToTextParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -31,11 +29,11 @@ export interface ImageToTextParameters {
 	maxNewTokens?: number;
 	[property: string]: unknown;
 }
-
+export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
  */
-export interface ImageToTextOutput {
+export interface ImageToTextOutputElement {
 	/**
 	 * The generated text.
 	 */
diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts
index f432d2cba5..9650c781e0 100644
--- a/packages/tasks/src/tasks/object-detection/inference.ts
+++ b/packages/tasks/src/tasks/object-detection/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Object Detection inference
  */
@@ -18,7 +17,6 @@ export interface ObjectDetectionInput {
 	parameters?: ObjectDetectionParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -31,11 +29,22 @@ export interface ObjectDetectionParameters {
 	threshold?: number;
 	[property: string]: unknown;
 }
-
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
+}
+export type ObjectDetectionOutput = ObjectDetectionOutputElement[];
 /**
  * Outputs of inference for the Object Detection task
  */
-export interface ObjectDetectionOutput {
+export interface ObjectDetectionOutputElement {
 	/**
 	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
 	 * image.
@@ -51,15 +60,3 @@ export interface ObjectDetectionOutput {
 	score: number;
 	[property: string]: unknown;
 }
-
-/**
- * The predicted bounding box. Coordinates are relative to the top left corner of the input
- * image.
- */
-export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
-}
diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts
index 1895b1dd49..bffc71cc6e 100644
--- a/packages/tasks/src/tasks/question-answering/inference.ts
+++ b/packages/tasks/src/tasks/question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface QuestionAnsweringInput {
 	parameters?: QuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (context, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface QuestionAnsweringInputData {
 	question: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -77,11 +74,11 @@ export interface QuestionAnsweringParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Question Answering task
  */
-export interface QuestionAnsweringOutput {
+export interface QuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question.
 	 */
diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index 836aab94df..ac04c6a32a 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Table Question Answering inference
  */
@@ -15,10 +14,11 @@ export interface TableQuestionAnsweringInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
 /**
  * One (table, question) pair to answer
  */
@@ -30,14 +30,16 @@ export interface TableQuestionAnsweringInputData {
 	/**
 	 * The table to serve as context for the questions
 	 */
-	table: { [key: string]: unknown };
+	table: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type TableQuestionAnsweringOutput = TableQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Table Question Answering task
  */
-export interface TableQuestionAnsweringOutput {
+export interface TableQuestionAnsweringOutputElement {
 	/**
 	 * If the model has an aggregator, this returns the aggregator.
 	 */
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 5f4f466a04..19298ccd09 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text Classification inference
  */
@@ -18,7 +17,6 @@ export interface TextClassificationInput {
 	parameters?: TextClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -35,13 +33,12 @@ export interface TextClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type TextClassificationOutput = TextClassificationOutputElement[];
 /**
  * Outputs of inference for the Text Classification task
  */
-export interface TextClassificationOutput {
+export interface TextClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts
index 13a09ff28b..94279336c8 100644
--- a/packages/tasks/src/tasks/text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text-generation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text Generation inference
  */
@@ -18,7 +17,6 @@ export interface TextGenerationInput {
 	parameters?: TextGenerationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -74,11 +72,11 @@ export interface TextGenerationParameters {
 	watermark?: boolean;
 	[property: string]: unknown;
 }
-
+export type TextGenerationOutput = TextGenerationOutputElement[];
 /**
  * Outputs for Text Generation inference
  */
-export interface TextGenerationOutput {
+export interface TextGenerationOutputElement {
 	/**
 	 * The generated text
 	 */
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index be2a70bfd3..d6a05e0177 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text To Audio inference
  */
@@ -15,14 +14,16 @@ export interface TextToAudioInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: {
+		[key: string]: unknown;
+	};
 	[property: string]: unknown;
 }
-
+export type TextToAudioOutput = TextToAudioOutputElement[];
 /**
  * Outputs of inference for the Text To Audio task
  */
-export interface TextToAudioOutput {
+export interface TextToAudioOutputElement {
 	/**
 	 * The generated audio waveform.
 	 */
diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts
index 81c160e27a..788845dd24 100644
--- a/packages/tasks/src/tasks/text2text-generation/inference.ts
+++ b/packages/tasks/src/tasks/text2text-generation/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Text2text Generation inference
  */
@@ -18,7 +17,6 @@ export interface Text2TextGenerationInput {
 	parameters?: Text2TextGenerationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -32,20 +30,21 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: unknown };
+	generateParameters?: {
+		[key: string]: unknown;
+	};
 	/**
 	 * The truncation strategy to use
 	 */
 	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
-
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
-
+export type Text2TextGenerationOutput = Text2TextGenerationOutputElement[];
 /**
  * Outputs of inference for the Text2text Generation task
  */
-export interface Text2TextGenerationOutput {
+export interface Text2TextGenerationOutputElement {
 	/**
 	 * The generated text.
 	 */
diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts
index 4584ca51de..7a8da8dcfc 100644
--- a/packages/tasks/src/tasks/token-classification/inference.ts
+++ b/packages/tasks/src/tasks/token-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Token Classification inference
  */
@@ -18,7 +17,6 @@ export interface TokenClassificationInput {
 	parameters?: TokenClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -39,7 +37,6 @@ export interface TokenClassificationParameters {
 	stride?: number;
 	[property: string]: unknown;
 }
-
 /**
  * Do not aggregate tokens
  *
@@ -55,11 +52,11 @@ export interface TokenClassificationParameters {
  * across the word's tokens).
  */
 export type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max";
-
+export type TokenClassificationOutput = TokenClassificationOutputElement[];
 /**
  * Outputs of inference for the Token Classification task
  */
-export interface TokenClassificationOutput {
+export interface TokenClassificationOutputElement {
 	/**
 	 * The character position in the input where this group ends.
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index 1914bfda65..ede6a25e42 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Video Classification inference
  */
@@ -18,7 +17,6 @@ export interface VideoClassificationInput {
 	parameters?: VideoClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -43,13 +41,12 @@ export interface VideoClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
 export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
-
+export type VideoClassificationOutput = VideoClassificationOutputElement[];
 /**
  * Outputs of inference for the Video Classification task
  */
-export interface VideoClassificationOutput {
+export interface VideoClassificationOutputElement {
 	/**
 	 * The predicted class label (model specific).
 	 */
diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts
index 0b0ee2e5a8..0eb513ebf6 100644
--- a/packages/tasks/src/tasks/visual-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Visual Question Answering inference
  */
@@ -18,7 +17,6 @@ export interface VisualQuestionAnsweringInput {
 	parameters?: VisualQuestionAnsweringParameters;
 	[property: string]: unknown;
 }
-
 /**
  * One (image, question) pair to answer
  */
@@ -33,7 +31,6 @@ export interface VisualQuestionAnsweringInputData {
 	question: unknown;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -48,11 +45,11 @@ export interface VisualQuestionAnsweringParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-
+export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
 /**
  * Outputs of inference for the Visual Question Answering task
  */
-export interface VisualQuestionAnsweringOutput {
+export interface VisualQuestionAnsweringOutputElement {
 	/**
 	 * The answer to the question
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index 369474a6da..db7f0c8bbe 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Zero Shot Classification inference
  */
@@ -18,7 +17,6 @@ export interface ZeroShotClassificationInput {
 	parameters?: ZeroShotClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * The input text data, with candidate labels
  */
@@ -33,7 +31,6 @@ export interface ZeroShotClassificationInputData {
 	text: string;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -53,11 +50,11 @@ export interface ZeroShotClassificationParameters {
 	multiLabel?: boolean;
 	[property: string]: unknown;
 }
-
+export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
 /**
  * Outputs of inference for the Zero Shot Classification task
  */
-export interface ZeroShotClassificationOutput {
+export interface ZeroShotClassificationOutputElement {
 	/**
 	 * A candidate label
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 65649ff5a6..22308aabb7 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -3,7 +3,6 @@
  *
  * Using src/scripts/inference-codegen
  */
-
 /**
  * Inputs for Zero Shot Image Classification inference
  */
@@ -18,7 +17,6 @@ export interface ZeroShotImageClassificationInput {
 	parameters?: ZeroShotImageClassificationParameters;
 	[property: string]: unknown;
 }
-
 /**
  * The input image data, with candidate labels
  */
@@ -33,7 +31,6 @@ export interface ZeroShotImageClassificationInputData {
 	image: unknown;
 	[property: string]: unknown;
 }
-
 /**
  * Additional inference parameters
  *
@@ -47,11 +44,11 @@ export interface ZeroShotImageClassificationParameters {
 	hypothesisTemplate?: string;
 	[property: string]: unknown;
 }
-
+export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Zero Shot Image Classification task
  */
-export interface ZeroShotImageClassificationOutput {
+export interface ZeroShotImageClassificationOutputElement {
 	/**
 	 * A candidate label
 	 */
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index 987662e24d..e69de29bb2 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -1,67 +0,0 @@
-/**
- * Inference code generated from the JSON schema spec in ./spec
- *
- * Using src/scripts/inference-codegen
- */
-
-/**
- * Inputs for Zero Shot Object Detection inference
- */
-export interface ZeroShotObjectDetectionInput {
-	/**
-	 * The input image data, with candidate labels
-	 */
-	data: ZeroShotObjectDetectionInputData;
-	/**
-	 * Additional inference parameters
-	 */
-	parameters?: { [key: string]: unknown };
-	[property: string]: unknown;
-}
-
-/**
- * The input image data, with candidate labels
- */
-export interface ZeroShotObjectDetectionInputData {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
-	[property: string]: unknown;
-}
-
-/**
- * Outputs of inference for the Zero Shot Object Detection task
- */
-export interface ZeroShotObjectDetectionOutput {
-	/**
-	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
-	 * image.
-	 */
-	box: BoundingBox;
-	/**
-	 * A candidate label
-	 */
-	label: string;
-	/**
-	 * The associated score / probability
-	 */
-	score: number;
-	[property: string]: unknown;
-}
-
-/**
- * The predicted bounding box. Coordinates are relative to the top left corner of the input
- * image.
- */
-export interface BoundingBox {
-	xmax: number;
-	xmin: number;
-	ymax: number;
-	ymin: number;
-	[property: string]: unknown;
-}

From c35fe85d22159a803525c7740144d2418c3f0b32 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:55:09 +0100
Subject: [PATCH 38/51] =?UTF-8?q?=F0=9F=93=9D=20Some=20comments?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index bf4d959ae9..ff26c128ce 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -63,6 +63,17 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 		},
 	});
 }
+/**
+ * quicktype is unable to generate "top-level array types" that are defined in the output spec: https://github.com/glideapps/quicktype/issues/2481
+ * We have to use the TypeScript API to generate those types when required.
+ * This hacky function:
+ *   - looks for the generated interface for output types
+ *   - renames it with a `Element` suffix
+ *   - generates  type alias in the form `export type <OutputType> = <OutputType>Element[];
+ * 
+ * And writes that to the `inference.ts` file
+ *   
+ */
 
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
 	const source = ts.createSourceFile(

From 6f1a8b36af5ca1c626c678b9465149390958532a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 12:58:21 +0100
Subject: [PATCH 39/51] =?UTF-8?q?=F0=9F=92=84=20Lint?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 packages/tasks/src/scripts/inference-codegen.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index ff26c128ce..c66d87b258 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -70,9 +70,9 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
  *   - looks for the generated interface for output types
  *   - renames it with a `Element` suffix
  *   - generates  type alias in the form `export type <OutputType> = <OutputType>Element[];
- * 
+ *
  * And writes that to the `inference.ts` file
- *   
+ *
  */
 
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {

From 9d25d281e911e1a20b830b3757ea5cd46ce29661 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:42:47 +0100
Subject: [PATCH 40/51] Add text-to-image pipeline

---
 .../src/tasks/text-to-image/inference.ts      | 69 +++++++++++++++++++
 .../src/tasks/text-to-image/spec/input.json   | 53 ++++++++++++++
 .../src/tasks/text-to-image/spec/output.json  | 15 ++++
 3 files changed, 137 insertions(+)
 create mode 100644 packages/tasks/src/tasks/text-to-image/inference.ts
 create mode 100644 packages/tasks/src/tasks/text-to-image/spec/input.json
 create mode 100644 packages/tasks/src/tasks/text-to-image/spec/output.json

diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts
new file mode 100644
index 0000000000..bfe0433f38
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/inference.ts
@@ -0,0 +1,69 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+
+/**
+ * Inputs for Text To Image inference
+ */
+export interface TextToImageInput {
+	/**
+	 * The input text data (sometimes called "prompt")
+	 */
+	data: string;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: TextToImageParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Image
+ */
+export interface TextToImageParameters {
+	/**
+	 * For diffusion models. A higher guidance scale value encourages the model to generate
+	 * images closely linked to the text prompt at the expense of lower image quality.
+	 */
+	guidanceScale?: number;
+	/**
+	 * One or several prompt to guide what NOT to include in image generation.
+	 */
+	negativePrompt?: string[];
+	/**
+	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+	 * a higher quality image at the expense of slower inference.
+	 */
+	numInferenceSteps?: number;
+	/**
+	 * The size in pixel of the output image
+	 */
+	targetSize?: TargetSize;
+	[property: string]: unknown;
+}
+
+/**
+ * The size in pixel of the output image
+ */
+export interface TargetSize {
+	height: number;
+	width: number;
+	[property: string]: unknown;
+}
+
+/**
+ * Outputs of inference for the Text To Image task
+ */
+export type TextToImageOutput = unknown[] | boolean | number | number | null | TextToImageOutputObject | string;
+
+export interface TextToImageOutputObject {
+	/**
+	 * The generated image
+	 */
+	image: unknown;
+	[property: string]: unknown;
+}
diff --git a/packages/tasks/src/tasks/text-to-image/spec/input.json b/packages/tasks/src/tasks/text-to-image/spec/input.json
new file mode 100644
index 0000000000..32a076dd0a
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/spec/input.json
@@ -0,0 +1,53 @@
+{
+	"$id": "/inference/schemas/text-to-image/input.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Inputs for Text To Image inference",
+	"title": "TextToImageInput",
+	"type": "object",
+	"properties": {
+		"data": {
+			"description": "The input text data (sometimes called \"prompt\")",
+			"type": "string"
+		},
+		"parameters": {
+			"description": "Additional inference parameters",
+			"$ref": "#/$defs/TextToImageParameters"
+		}
+	},
+	"$defs": {
+		"TextToImageParameters": {
+			"title": "TextToImageParameters",
+			"description": "Additional inference parameters for Text To Image",
+			"type": "object",
+			"properties": {
+				"guidanceScale": {
+					"type": "number",
+					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+				},
+				"negativePrompt": {
+					"type": "array",
+					"items": { "type": "string" },
+					"description": "One or several prompt to guide what NOT to include in image generation."
+				},
+				"numInferenceSteps": {
+					"type": "integer",
+					"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"targetSize": {
+					"type": "object",
+					"description": "The size in pixel of the output image",
+					"properties": {
+						"width": {
+							"type": "integer"
+						},
+						"height": {
+							"type": "integer"
+						}
+					},
+					"required": ["width", "height"]
+				}
+			}
+		}
+	},
+	"required": ["data"]
+}
diff --git a/packages/tasks/src/tasks/text-to-image/spec/output.json b/packages/tasks/src/tasks/text-to-image/spec/output.json
new file mode 100644
index 0000000000..5ab3ee7879
--- /dev/null
+++ b/packages/tasks/src/tasks/text-to-image/spec/output.json
@@ -0,0 +1,15 @@
+{
+	"$id": "/inference/schemas/text-to-image/output.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Outputs of inference for the Text To Image task",
+	"title": "TextToImageOutput",
+	"type": "array",
+	"items": {
+		"properties": {
+			"image": {
+				"description": "The generated image"
+			}
+		},
+		"required": ["image"]
+	}
+}

From 499ed5f7f3f182e3178671e9327222ecdfdb60d4 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:44:42 +0100
Subject: [PATCH 41/51] Update image-to-image output

---
 packages/tasks/src/tasks/image-to-image/spec/output.json | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
index 5e55f5677a..af4eff8046 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/output.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -3,8 +3,10 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Image task",
 	"title": "ImageToImageOutput",
-	"type": "array",
-	"items": {
-		"description": "The output image"
+	"type": "object",
+	"properties": {
+		"image": {
+			"description": "The output image"
+		}
 	}
 }

From bf48f5e3426852499d3ec746911ae6bc5e291210 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 15:51:34 +0100
Subject: [PATCH 42/51] Update image-to-image inputs

---
 .../src/tasks/image-to-image/inference.ts     | 51 +++++++++++++++++--
 .../src/tasks/image-to-image/spec/input.json  | 31 ++++++++++-
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
index f05e24b6e4..96a532b252 100644
--- a/packages/tasks/src/tasks/image-to-image/inference.ts
+++ b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -4,8 +4,6 @@
  * Using src/scripts/inference-codegen
  */
 
-export type ImageToImageOutput = unknown[];
-
 /**
  * Inputs for Image To Image inference
  */
@@ -17,6 +15,53 @@ export interface ImageToImageInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: unknown;
+	parameters?: ImageToImageParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Image To Image
+ */
+export interface ImageToImageParameters {
+	/**
+	 * For diffusion models. A higher guidance scale value encourages the model to generate
+	 * images closely linked to the text prompt at the expense of lower image quality.
+	 */
+	guidanceScale?: number;
+	/**
+	 * One or several prompt to guide what NOT to include in image generation.
+	 */
+	negativePrompt?: string[];
+	/**
+	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
+	 * a higher quality image at the expense of slower inference.
+	 */
+	numInferenceSteps?: number;
+	/**
+	 * The size in pixel of the output image
+	 */
+	targetSize?: TargetSize;
+	[property: string]: unknown;
+}
+
+/**
+ * The size in pixel of the output image
+ */
+export interface TargetSize {
+	height: number;
+	width: number;
+	[property: string]: unknown;
+}
+
+/**
+ * Outputs of inference for the Image To Image task
+ */
+export interface ImageToImageOutput {
+	/**
+	 * The output image
+	 */
+	image?: unknown;
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
index d91d6e6d4a..11d4bee8af 100644
--- a/packages/tasks/src/tasks/image-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -16,7 +16,36 @@
 	"$defs": {
 		"ImageToImageParameters": {
 			"title": "ImageToImageParameters",
-			"description": "Additional inference parameters for Image To Image"
+			"description": "Additional inference parameters for Image To Image",
+			"type": "object",
+			"properties": {
+				"guidanceScale": {
+					"type": "number",
+					"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
+				},
+				"negativePrompt": {
+					"type": "array",
+					"items": { "type": "string" },
+					"description": "One or several prompt to guide what NOT to include in image generation."
+				},
+				"numInferenceSteps": {
+					"type": "integer",
+					"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
+				},
+				"targetSize": {
+					"type": "object",
+					"description": "The size in pixel of the output image",
+					"properties": {
+						"width": {
+							"type": "integer"
+						},
+						"height": {
+							"type": "integer"
+						}
+					},
+					"required": ["width", "height"]
+				}
+			}
 		}
 	},
 	"required": ["data"]

From 49a815101e22c4a8429934252cefe07e5517263a Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 16:34:17 +0100
Subject: [PATCH 43/51] Factorize generate parameters

---
 .../tasks/src/scripts/inference-codegen.ts    | 10 +++---
 .../automatic-speech-recognition/inference.ts | 32 +++++++++++++++++--
 .../spec/input.json                           | 11 ++++++-
 .../src/tasks/image-to-text/inference.ts      | 16 ++++++++++
 .../src/tasks/image-to-text/spec/input.json   |  4 +++
 packages/tasks/src/tasks/schema-utils.json    | 18 +++++++++++
 .../src/tasks/text-to-audio/inference.ts      | 28 ++++++++++++++--
 .../src/tasks/text-to-audio/spec/input.json   |  7 +++-
 .../src/tasks/text-to-speech/inference.ts     | 28 +++++++++++++++-
 9 files changed, 141 insertions(+), 13 deletions(-)
 create mode 100644 packages/tasks/src/tasks/schema-utils.json

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index c66d87b258..ac72ff9f70 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -74,7 +74,6 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
  * And writes that to the `inference.ts` file
  *
  */
-
 async function postProcessOutput(path2generated: string, outputSpec: Record<string, unknown>): Promise<void> {
 	const source = ts.createSourceFile(
 		path.basename(path2generated),
@@ -149,9 +148,12 @@ async function main() {
 			.filter((entry) => entry.name !== "placeholder")
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
-	const allSpecFiles = allTasks
-		.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
-		.filter((filepath) => pathExists(filepath));
+	const allSpecFiles = [
+		path.join(tasksDir, "schema-utils.json"),
+		...allTasks
+			.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
+			.filter((filepath) => pathExists(filepath)),
+	];
 
 	for (const { task, dirPath } of allTasks) {
 		const taskSpecDir = path.join(dirPath, "spec");
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index d83c45af5d..244b44b694 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -14,9 +14,35 @@ export interface AutomaticSpeechRecognitionInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: {
-		[key: string]: unknown;
-	};
+	parameters?: AutomaticSpeechRecognitionParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Automatic Speech Recognition
+ */
+export interface AutomaticSpeechRecognitionParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	/**
+	 * Whether to output corresponding timestamps with the generated text
+	 */
+	returnTimestamps?: boolean;
+	[property: string]: unknown;
+}
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index f44075d56c..93621151e5 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -18,7 +18,16 @@
 			"title": "AutomaticSpeechRecognitionParameters",
 			"description": "Additional inference parameters for Automatic Speech Recognition",
 			"type": "object",
-			"properties": {}
+			"properties": {
+				"returnTimestamps": {
+					"type": "boolean",
+					"description": "Whether to output corresponding timestamps with the generated text"
+				},
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+				}
+			}
 		}
 	},
 	"required": ["data"]
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index cba7451392..c87a51ce32 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -23,12 +23,28 @@ export interface ImageToTextInput {
  * Additional inference parameters for Image To Text
  */
 export interface ImageToTextParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
 	/**
 	 * The amount of maximum tokens to generate.
 	 */
 	maxNewTokens?: number;
 	[property: string]: unknown;
 }
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
+	[property: string]: unknown;
+}
 export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index f06eb59f00..b074372fce 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -22,6 +22,10 @@
 				"maxNewTokens": {
 					"type": "integer",
 					"description": "The amount of maximum tokens to generate."
+				},
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
new file mode 100644
index 0000000000..60c833f60d
--- /dev/null
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -0,0 +1,18 @@
+{
+	"$id": "/inference/schemas/schema-utils.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "Common type definitions shared by several tasks",
+	"definitions": {
+		"GenerationParameters": {
+			"title": "GenerationParameters",
+			"description": "Ad-hoc parametrization of the text generation process",
+			"type": "object",
+			"properties": {
+				"temperature": {
+					"type": "number",
+					"description": "I can be the papa you'd be the mama"
+				}
+			}
+		}
+	}
+}
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index d6a05e0177..41796240a8 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -14,9 +14,31 @@ export interface TextToAudioInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: {
-		[key: string]: unknown;
-	};
+	parameters?: TextToAudioParameters;
+	[property: string]: unknown;
+}
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Audio
+ */
+export interface TextToAudioParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	[property: string]: unknown;
+}
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 export type TextToAudioOutput = TextToAudioOutputElement[];
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index 1760609624..d049fb02e7 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -19,7 +19,12 @@
 			"title": "TextToAudioParameters",
 			"description": "Additional inference parameters for Text To Audio",
 			"type": "object",
-			"properties": {}
+			"properties": {
+				"generate": {
+					"description": "Parametrization of the text generation process",
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+				}
+			}
 		}
 	},
 	"required": ["data"]
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index f119bc62f1..a899740729 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -17,7 +17,33 @@ export interface TextToSpeechInput {
 	/**
 	 * Additional inference parameters
 	 */
-	parameters?: { [key: string]: unknown };
+	parameters?: TextToAudioParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Text To Audio
+ */
+export interface TextToAudioParameters {
+	/**
+	 * Parametrization of the text generation process
+	 */
+	generate?: GenerationParameters;
+	[property: string]: unknown;
+}
+
+/**
+ * Parametrization of the text generation process
+ *
+ * Ad-hoc parametrization of the text generation process
+ */
+export interface GenerationParameters {
+	/**
+	 * I can be the papa you'd be the mama
+	 */
+	temperature?: number;
 	[property: string]: unknown;
 }
 

From e4f3d138493a59bb76a6d7371913c3768b48f1eb Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 16:42:14 +0100
Subject: [PATCH 44/51] Correctly type ASR output

---
 .../automatic-speech-recognition/inference.ts | 16 ++++++++++++++++
 .../spec/output.json                          | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index 244b44b694..ee17e64f4e 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -45,11 +45,27 @@ export interface GenerationParameters {
 	temperature?: number;
 	[property: string]: unknown;
 }
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * A chunk of text identified by the model
+	 */
+	text: string;
+	/**
+	 * The start and end timestamps corresponding with the text
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
 export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
 export interface AutomaticSpeechRecognitionOutputElement {
+	/**
+	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
+	 * the model.
+	 */
+	chunks?: AutomaticSpeechRecognitionOutputChunk[];
 	/**
 	 * The recognized text.
 	 */
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
index 72573986d7..217f210b15 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json
@@ -10,6 +10,25 @@
 			"text": {
 				"type": "string",
 				"description": "The recognized text."
+			},
+			"chunks": {
+				"type": "array",
+				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+				"items": {
+					"type": "object",
+					"title": "AutomaticSpeechRecognitionOutputChunk",
+					"properties": {
+						"text": { "type": "string", "description": "A chunk of text identified by the model" },
+						"timestamps": {
+							"type": "array",
+							"description": "The start and end timestamps corresponding with the text",
+							"items": { "type": "number" },
+							"minItems": 2,
+							"maxItems": 2
+						}
+					},
+					"required": ["text", "timestamps"]
+				}
 			}
 		},
 		"required": ["text"]

From 826181a63831cbe92c97a4e23772fcb9d47967f6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:03:24 +0100
Subject: [PATCH 45/51] wip: spec generate parameters

---
 .../automatic-speech-recognition/inference.ts | 82 +++++++++++++++++-
 .../src/tasks/image-to-text/inference.ts      | 82 +++++++++++++++++-
 packages/tasks/src/tasks/schema-utils.json    | 64 +++++++++++++-
 .../src/tasks/text-to-audio/inference.ts      | 82 +++++++++++++++++-
 .../src/tasks/text-to-speech/inference.ts     | 83 ++++++++++++++++++-
 5 files changed, 387 insertions(+), 6 deletions(-)

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
index ee17e64f4e..d9e2adc859 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -40,11 +40,91 @@ export interface AutomaticSpeechRecognitionParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export interface AutomaticSpeechRecognitionOutputChunk {
 	/**
 	 * A chunk of text identified by the model
diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts
index c87a51ce32..19bb147e2d 100644
--- a/packages/tasks/src/tasks/image-to-text/inference.ts
+++ b/packages/tasks/src/tasks/image-to-text/inference.ts
@@ -40,11 +40,91 @@ export interface ImageToTextParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export type ImageToTextOutput = ImageToTextOutputElement[];
 /**
  * Outputs of inference for the Image To Text task
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
index 60c833f60d..5a3d3e8120 100644
--- a/packages/tasks/src/tasks/schema-utils.json
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -1,7 +1,7 @@
 {
 	"$id": "/inference/schemas/schema-utils.json",
 	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "Common type definitions shared by several tasks",
+	"description": "(Incomplete!) Common type definitions shared by several tasks",
 	"definitions": {
 		"GenerationParameters": {
 			"title": "GenerationParameters",
@@ -10,7 +10,67 @@
 			"properties": {
 				"temperature": {
 					"type": "number",
-					"description": "I can be the papa you'd be the mama"
+					"description": "The value used to modulate the next token probabilities."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
+				},
+				"epsilonCutoff": {
+					"type": "number",
+					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"etaCutoff": {
+					"type": "number",
+					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"maxLength": {
+					"type": "integer",
+					"description": "The maximum length (in tokens) of the generated text, including the input."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
+				},
+				"minLength": {
+					"type": "integer",
+					"description": "The minimum length (in tokens) of the generated text, including the input."
+				},
+				"minNewTokens": {
+					"type": "integer",
+					"description": "The minimum number of tokens to generate. Takes precedence over minLength."
+				},
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
+				},
+				"earlyStopping": {
+					"description": "Controls the stopping condition for beam-based methods.",
+					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
+				},
+				"numBeams": {
+					"type": "integer",
+					"description": "Number of beams to use for beam search."
+				},
+				"numBeamGroups": {
+					"type": "integer",
+					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
+				},
+				"penaltyAlpha": {
+					"type": "number",
+					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
+				},
+				"useCache": {
+					"type": "boolean",
+					"description": "Whether the model should use the past last key/values attentions to speed up decoding"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts
index 41796240a8..14c484bf2f 100644
--- a/packages/tasks/src/tasks/text-to-audio/inference.ts
+++ b/packages/tasks/src/tasks/text-to-audio/inference.ts
@@ -36,11 +36,91 @@ export interface TextToAudioParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
 export type TextToAudioOutput = TextToAudioOutputElement[];
 /**
  * Outputs of inference for the Text To Audio task
diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts
index a899740729..f67e03652a 100644
--- a/packages/tasks/src/tasks/text-to-speech/inference.ts
+++ b/packages/tasks/src/tasks/text-to-speech/inference.ts
@@ -41,12 +41,93 @@ export interface TextToAudioParameters {
  */
 export interface GenerationParameters {
 	/**
-	 * I can be the papa you'd be the mama
+	 * Whether to use sampling instead of greedy decoding when generating new tokens.
+	 */
+	doSample?: boolean;
+	/**
+	 * Controls the stopping condition for beam-based methods.
+	 */
+	earlyStopping?: EarlyStoppingUnion;
+	/**
+	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
+	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
+	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
+	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
+	 */
+	epsilonCutoff?: number;
+	/**
+	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+	 * float strictly between 0 and 1, a token is only considered if it is greater than either
+	 * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+	 * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+	 * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+	 * for more details.
+	 */
+	etaCutoff?: number;
+	/**
+	 * The maximum length (in tokens) of the generated text, including the input.
+	 */
+	maxLength?: number;
+	/**
+	 * The maximum number of tokens to generate. Takes precedence over maxLength.
+	 */
+	maxNewTokens?: number;
+	/**
+	 * The minimum length (in tokens) of the generated text, including the input.
+	 */
+	minLength?: number;
+	/**
+	 * The minimum number of tokens to generate. Takes precedence over minLength.
+	 */
+	minNewTokens?: number;
+	/**
+	 * Number of groups to divide num_beams into in order to ensure diversity among different
+	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+	 */
+	numBeamGroups?: number;
+	/**
+	 * Number of beams to use for beam search.
+	 */
+	numBeams?: number;
+	/**
+	 * The value balances the model confidence and the degeneration penalty in contrastive
+	 * search decoding.
+	 */
+	penaltyAlpha?: number;
+	/**
+	 * The value used to modulate the next token probabilities.
 	 */
 	temperature?: number;
+	/**
+	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
+	 */
+	topK?: number;
+	/**
+	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
+	 * that add up to top_p or higher are kept for generation.
+	 */
+	topP?: number;
+	/**
+	 * Local typicality measures how similar the conditional probability of predicting a target
+	 * token next is to the expected conditional probability of predicting a random token next,
+	 * given the partial text already generated. If set to float < 1, the smallest set of the
+	 * most locally typical tokens with probabilities that add up to typical_p or higher are
+	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+	 */
+	typicalP?: number;
+	/**
+	 * Whether the model should use the past last key/values attentions to speed up decoding
+	 */
+	useCache?: boolean;
 	[property: string]: unknown;
 }
 
+/**
+ * Controls the stopping condition for beam-based methods.
+ */
+export type EarlyStoppingUnion = boolean | "never";
+
 /**
  * Outputs for Text to Speech inference
  *

From 0000f02cfd68b86bd9865e396a3672a53a370225 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:15:11 +0100
Subject: [PATCH 46/51] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Factorize=20common?=
 =?UTF-8?q?=20classification=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tasks/audio-classification/inference.ts   | 10 +++---
 .../audio-classification/spec/input.json      | 14 +--------
 .../tasks/image-classification/inference.ts   | 12 +++----
 .../image-classification/spec/input.json      | 14 +--------
 .../image-classification/spec/output.json     | 13 +-------
 packages/tasks/src/tasks/schema-utils.json    | 31 +++++++++++++++++++
 .../tasks/text-classification/inference.ts    | 12 +++----
 .../tasks/text-classification/spec/input.json | 14 +--------
 .../text-classification/spec/output.json      | 13 +-------
 .../tasks/video-classification/inference.ts   | 12 +++----
 .../video-classification/spec/input.json      | 14 +--------
 .../video-classification/spec/output.json     | 13 +-------
 .../zero-shot-classification/inference.ts     |  4 +--
 .../zero-shot-classification/spec/output.json | 13 +-------
 .../inference.ts                              |  4 +--
 .../spec/output.json                          | 13 +-------
 16 files changed, 67 insertions(+), 139 deletions(-)

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
index 6671cdf14e..ae37f29acf 100644
--- a/packages/tasks/src/tasks/audio-classification/inference.ts
+++ b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -23,17 +23,17 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: AudioClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type AudioClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type AudioClassificationOutput = AudioClassificationOutputElement[];
 /**
  * Outputs for Audio Classification inference
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 80e8651fe7..4bf3639e6a 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "AudioClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
index 5a43acdf5d..7138a50735 100644
--- a/packages/tasks/src/tasks/image-classification/inference.ts
+++ b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -23,24 +23,24 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: ImageClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type ImageClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type ImageClassificationOutput = ImageClassificationOutputElement[];
 /**
  * Outputs of inference for the Image Classification task
  */
 export interface ImageClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 1dee66b97b..081c05a5f8 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "ImageClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index a875898b64..b7b8ed4246 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
index 5a3d3e8120..49157797a7 100644
--- a/packages/tasks/src/tasks/schema-utils.json
+++ b/packages/tasks/src/tasks/schema-utils.json
@@ -3,6 +3,37 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "(Incomplete!) Common type definitions shared by several tasks",
 	"definitions": {
+		"ClassificationOutputTransform": {
+			"title": "ClassificationOutputTransform",
+			"type": "string",
+			"description": "The function to apply to the model outputs in order to retrieve the scores.",
+			"oneOf": [
+				{
+					"const": "sigmoid"
+				},
+				{
+					"const": "softmax"
+				},
+				{
+					"const": "none"
+				}
+			]
+		},
+		"ClassificationOutput": {
+			"title": "ClassificationOutput",
+			"type": "object",
+			"properties": {
+				"label": {
+					"type": "string",
+					"description": "The predicted class label."
+				},
+				"score": {
+					"type": "number",
+					"description": "The corresponding probability."
+				}
+			},
+			"required": ["label", "score"]
+		},
 		"GenerationParameters": {
 			"title": "GenerationParameters",
 			"description": "Ad-hoc parametrization of the text generation process",
diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts
index 19298ccd09..9bc728a50c 100644
--- a/packages/tasks/src/tasks/text-classification/inference.ts
+++ b/packages/tasks/src/tasks/text-classification/inference.ts
@@ -23,24 +23,24 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
 	topK?: number;
 	[property: string]: unknown;
 }
-export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type TextClassificationOutput = TextClassificationOutputElement[];
 /**
  * Outputs of inference for the Text Classification task
  */
 export interface TextClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 08bac5953c..87031422c1 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -22,19 +22,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index b2b81acde6..95d1ca5ee3 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts
index ede6a25e42..1f765160f3 100644
--- a/packages/tasks/src/tasks/video-classification/inference.ts
+++ b/packages/tasks/src/tasks/video-classification/inference.ts
@@ -27,10 +27,7 @@ export interface VideoClassificationParameters {
 	 * The sampling rate used to select frames from the video.
 	 */
 	frameSamplingRate?: number;
-	/**
-	 * The function to apply to the model outputs in order to retrieve the scores.
-	 */
-	functionToApply?: TextClassificationOutputTransform;
+	functionToApply?: ClassificationOutputTransform;
 	/**
 	 * The number of sampled frames to consider for classification.
 	 */
@@ -41,14 +38,17 @@ export interface VideoClassificationParameters {
 	topK?: number;
 	[property: string]: unknown;
 }
-export type TextClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+/**
+ * The function to apply to the model outputs in order to retrieve the scores.
+ */
+export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
 export type VideoClassificationOutput = VideoClassificationOutputElement[];
 /**
  * Outputs of inference for the Video Classification task
  */
 export interface VideoClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 386992c9ad..0607bbf522 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -21,19 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"type": "string",
-					"description": "The function to apply to the model outputs in order to retrieve the scores.",
-					"oneOf": [
-						{
-							"const": "sigmoid"
-						},
-						{
-							"const": "softmax"
-						},
-						{
-							"const": "none"
-						}
-					]
+					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
 				},
 				"numFrames": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 9220cdbaec..247aae997e 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
index db7f0c8bbe..e0b43ec70b 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts
@@ -56,11 +56,11 @@ export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[]
  */
 export interface ZeroShotClassificationOutputElement {
 	/**
-	 * A candidate label
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
-	 * The associated score / probability
+	 * The corresponding probability.
 	 */
 	score: number;
 	[property: string]: unknown;
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index 27ad4b00e2..d9e78c2319 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "A candidate label"
-			},
-			"score": {
-				"type": "number",
-				"description": "The associated score / probability"
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
index 22308aabb7..2bea5436b8 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts
@@ -50,11 +50,11 @@ export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutpu
  */
 export interface ZeroShotImageClassificationOutputElement {
 	/**
-	 * A candidate label
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
-	 * The associated score / probability
+	 * The corresponding probability.
 	 */
 	score: number;
 	[property: string]: unknown;
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 2b0e78b84c..68a5ecfb03 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -5,17 +5,6 @@
 	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "A candidate label"
-			},
-			"score": {
-				"type": "number",
-				"description": "The associated score / probability"
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
 	}
 }

From 8dc4d172d21d77023a37a5627b273982c3adcdc2 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:17:35 +0100
Subject: [PATCH 47/51] fix: await writefile in post process

---
 .../tasks/src/scripts/inference-codegen.ts    |  2 +-
 .../zero-shot-object-detection/inference.ts   | 66 +++++++++++++++++++
 .../spec/output.json                          |  1 +
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index ac72ff9f70..8ad7fc3e0f 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -131,7 +131,7 @@ async function postProcessOutput(path2generated: string, outputSpec: Record<stri
 		updatedInterface,
 	]);
 
-	fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), {
+	await fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), {
 		flag: "w+",
 		encoding: "utf-8",
 	});
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
index e69de29bb2..edb51172ec 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts
@@ -0,0 +1,66 @@
+/**
+ * Inference code generated from the JSON schema spec in ./spec
+ *
+ * Using src/scripts/inference-codegen
+ */
+/**
+ * Inputs for Zero Shot Object Detection inference
+ */
+export interface ZeroShotObjectDetectionInput {
+	/**
+	 * The input image data, with candidate labels
+	 */
+	data: ZeroShotObjectDetectionInputData;
+	/**
+	 * Additional inference parameters
+	 */
+	parameters?: {
+		[key: string]: unknown;
+	};
+	[property: string]: unknown;
+}
+/**
+ * The input image data, with candidate labels
+ */
+export interface ZeroShotObjectDetectionInputData {
+	/**
+	 * The candidate labels for this image
+	 */
+	candidateLabels: string[];
+	/**
+	 * The image data to generate bounding boxes from
+	 */
+	image: unknown;
+	[property: string]: unknown;
+}
+/**
+ * The predicted bounding box. Coordinates are relative to the top left corner of the input
+ * image.
+ */
+export interface BoundingBox {
+	xmax: number;
+	xmin: number;
+	ymax: number;
+	ymin: number;
+	[property: string]: unknown;
+}
+export type ZeroShotObjectDetectionOutput = ZeroShotObjectDetectionOutputElement[];
+/**
+ * Outputs of inference for the Zero Shot Object Detection task
+ */
+export interface ZeroShotObjectDetectionOutputElement {
+	/**
+	 * The predicted bounding box. Coordinates are relative to the top left corner of the input
+	 * image.
+	 */
+	box: BoundingBox;
+	/**
+	 * A candidate label
+	 */
+	label: string;
+	/**
+	 * The associated score / probability
+	 */
+	score: number;
+	[property: string]: unknown;
+}
diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
index 6293efc978..8afa605276 100644
--- a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json
@@ -6,6 +6,7 @@
 	"type": "array",
 	"items": {
 		"type": "object",
+		"title": "ZeroShotObjectDetectionOutputElement",
 		"properties": {
 			"label": {
 				"type": "string",

From 9ccb3a44dbb2234d747b7cc067c772a729794f2d Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:19:40 +0100
Subject: [PATCH 48/51] add scheduler param

---
 packages/tasks/src/tasks/text-to-image/spec/input.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/packages/tasks/src/tasks/text-to-image/spec/input.json b/packages/tasks/src/tasks/text-to-image/spec/input.json
index 32a076dd0a..cb1e1c6cf6 100644
--- a/packages/tasks/src/tasks/text-to-image/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-image/spec/input.json
@@ -45,6 +45,10 @@
 						}
 					},
 					"required": ["width", "height"]
+				},
+				"scheduler": {
+					"type": "string",
+					"description": "For diffusion models. Override the scheduler with a compatible one"
 				}
 			}
 		}

From accdeffba68542882b8f19921dd1c30a5ee25c16 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:21:59 +0100
Subject: [PATCH 49/51] rename schema-utils to common-definitions

---
 .../tasks/src/scripts/inference-codegen.ts    |   2 +-
 .../audio-classification/spec/input.json      |   2 +-
 .../spec/input.json                           |   2 +-
 .../image-classification/spec/input.json      |   2 +-
 .../image-classification/spec/output.json     |   2 +-
 .../src/tasks/image-to-text/spec/input.json   |   2 +-
 packages/tasks/src/tasks/schema-utils.json    | 109 ------------------
 .../tasks/text-classification/spec/input.json |   2 +-
 .../text-classification/spec/output.json      |   2 +-
 .../src/tasks/text-to-audio/spec/input.json   |   2 +-
 .../src/tasks/text-to-image/inference.ts      |   4 +
 .../video-classification/spec/input.json      |   2 +-
 .../video-classification/spec/output.json     |   2 +-
 .../zero-shot-classification/spec/output.json |   2 +-
 .../spec/output.json                          |   2 +-
 15 files changed, 17 insertions(+), 122 deletions(-)
 delete mode 100644 packages/tasks/src/tasks/schema-utils.json

diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts
index 8ad7fc3e0f..02c8e30031 100644
--- a/packages/tasks/src/scripts/inference-codegen.ts
+++ b/packages/tasks/src/scripts/inference-codegen.ts
@@ -149,7 +149,7 @@ async function main() {
 			.map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) }))
 	);
 	const allSpecFiles = [
-		path.join(tasksDir, "schema-utils.json"),
+		path.join(tasksDir, "common-definitions.json"),
 		...allTasks
 			.flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")])
 			.filter((filepath) => pathExists(filepath)),
diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
index 4bf3639e6a..cfd5a54a6f 100644
--- a/packages/tasks/src/tasks/audio-classification/spec/input.json
+++ b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "AudioClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
index 93621151e5..2d31957ed2 100644
--- a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
+++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -25,7 +25,7 @@
 				},
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
index 081c05a5f8..362c0d5171 100644
--- a/packages/tasks/src/tasks/image-classification/spec/input.json
+++ b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "ImageClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
index b7b8ed4246..2a3264bce7 100644
--- a/packages/tasks/src/tasks/image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json
index b074372fce..0ef8ba1dc5 100644
--- a/packages/tasks/src/tasks/image-to-text/spec/input.json
+++ b/packages/tasks/src/tasks/image-to-text/spec/input.json
@@ -25,7 +25,7 @@
 				},
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/schema-utils.json b/packages/tasks/src/tasks/schema-utils.json
deleted file mode 100644
index 49157797a7..0000000000
--- a/packages/tasks/src/tasks/schema-utils.json
+++ /dev/null
@@ -1,109 +0,0 @@
-{
-	"$id": "/inference/schemas/schema-utils.json",
-	"$schema": "http://json-schema.org/draft-06/schema#",
-	"description": "(Incomplete!) Common type definitions shared by several tasks",
-	"definitions": {
-		"ClassificationOutputTransform": {
-			"title": "ClassificationOutputTransform",
-			"type": "string",
-			"description": "The function to apply to the model outputs in order to retrieve the scores.",
-			"oneOf": [
-				{
-					"const": "sigmoid"
-				},
-				{
-					"const": "softmax"
-				},
-				{
-					"const": "none"
-				}
-			]
-		},
-		"ClassificationOutput": {
-			"title": "ClassificationOutput",
-			"type": "object",
-			"properties": {
-				"label": {
-					"type": "string",
-					"description": "The predicted class label."
-				},
-				"score": {
-					"type": "number",
-					"description": "The corresponding probability."
-				}
-			},
-			"required": ["label", "score"]
-		},
-		"GenerationParameters": {
-			"title": "GenerationParameters",
-			"description": "Ad-hoc parametrization of the text generation process",
-			"type": "object",
-			"properties": {
-				"temperature": {
-					"type": "number",
-					"description": "The value used to modulate the next token probabilities."
-				},
-				"topK": {
-					"type": "integer",
-					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
-				},
-				"topP": {
-					"type": "number",
-					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
-				},
-				"typicalP": {
-					"type": "number",
-					"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
-				},
-				"epsilonCutoff": {
-					"type": "number",
-					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
-				},
-				"etaCutoff": {
-					"type": "number",
-					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
-				},
-				"maxLength": {
-					"type": "integer",
-					"description": "The maximum length (in tokens) of the generated text, including the input."
-				},
-				"maxNewTokens": {
-					"type": "integer",
-					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
-				},
-				"minLength": {
-					"type": "integer",
-					"description": "The minimum length (in tokens) of the generated text, including the input."
-				},
-				"minNewTokens": {
-					"type": "integer",
-					"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
-				},
-				"doSample": {
-					"type": "boolean",
-					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
-				},
-				"earlyStopping": {
-					"description": "Controls the stopping condition for beam-based methods.",
-					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
-				},
-				"numBeams": {
-					"type": "integer",
-					"description": "Number of beams to use for beam search."
-				},
-				"numBeamGroups": {
-					"type": "integer",
-					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
-				},
-				"penaltyAlpha": {
-					"type": "number",
-					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
-				},
-				"useCache": {
-					"type": "boolean",
-					"description": "Whether the model should use the past last key/values attentions to speed up decoding"
-				}
-			}
-		}
-	}
-}
diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json
index 87031422c1..6ae6f1c39c 100644
--- a/packages/tasks/src/tasks/text-classification/spec/input.json
+++ b/packages/tasks/src/tasks/text-classification/spec/input.json
@@ -22,7 +22,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"topK": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json
index 95d1ca5ee3..704b82225b 100644
--- a/packages/tasks/src/tasks/text-classification/spec/output.json
+++ b/packages/tasks/src/tasks/text-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "TextClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json
index d049fb02e7..95bd8d16db 100644
--- a/packages/tasks/src/tasks/text-to-audio/spec/input.json
+++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json
@@ -22,7 +22,7 @@
 			"properties": {
 				"generate": {
 					"description": "Parametrization of the text generation process",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/GenerationParameters"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
 				}
 			}
 		}
diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts
index bfe0433f38..c25031b29e 100644
--- a/packages/tasks/src/tasks/text-to-image/inference.ts
+++ b/packages/tasks/src/tasks/text-to-image/inference.ts
@@ -39,6 +39,10 @@ export interface TextToImageParameters {
 	 * a higher quality image at the expense of slower inference.
 	 */
 	numInferenceSteps?: number;
+	/**
+	 * For diffusion models. Override the scheduler with a compatible one
+	 */
+	scheduler?: string;
 	/**
 	 * The size in pixel of the output image
 	 */
diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json
index 0607bbf522..984670953b 100644
--- a/packages/tasks/src/tasks/video-classification/spec/input.json
+++ b/packages/tasks/src/tasks/video-classification/spec/input.json
@@ -21,7 +21,7 @@
 			"properties": {
 				"functionToApply": {
 					"title": "TextClassificationOutputTransform",
-					"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutputTransform"
+					"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 				},
 				"numFrames": {
 					"type": "integer",
diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json
index 247aae997e..4c24f5d577 100644
--- a/packages/tasks/src/tasks/video-classification/spec/output.json
+++ b/packages/tasks/src/tasks/video-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "VideoClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
index d9e78c2319..83ed1098fd 100644
--- a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ZeroShotClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
index 68a5ecfb03..6b795fbdba 100644
--- a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
+++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json
@@ -5,6 +5,6 @@
 	"title": "ZeroShotImageClassificationOutput",
 	"type": "array",
 	"items": {
-		"$ref": "/inference/schemas/schema-utils.json#/definitions/ClassificationOutput"
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }

From 3a3d4ba6318d947e40d73de3469f0e27dafde1e6 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:24:21 +0100
Subject: [PATCH 50/51] proper type for table QA

---
 packages/tasks/src/tasks/table-question-answering/inference.ts | 2 +-
 .../tasks/src/tasks/table-question-answering/spec/input.json   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts b/packages/tasks/src/tasks/table-question-answering/inference.ts
index ac04c6a32a..fe06dbbfe5 100644
--- a/packages/tasks/src/tasks/table-question-answering/inference.ts
+++ b/packages/tasks/src/tasks/table-question-answering/inference.ts
@@ -31,7 +31,7 @@ export interface TableQuestionAnsweringInputData {
 	 * The table to serve as context for the questions
 	 */
 	table: {
-		[key: string]: unknown;
+		[key: string]: string[];
 	};
 	[property: string]: unknown;
 }
diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json
index e3fc6db9f4..6309cf1f36 100644
--- a/packages/tasks/src/tasks/table-question-answering/spec/input.json
+++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json
@@ -12,7 +12,8 @@
 			"properties": {
 				"table": {
 					"description": "The table to serve as context for the questions",
-					"type": "object"
+					"type": "object",
+					"additionalProperties": { "type": "array", "items": { "type": "string" } }
 				},
 				"question": {
 					"description": "The question to be answered about the table",

From 4742c9ed37c44bf413f43474e8804e4ea44417f1 Mon Sep 17 00:00:00 2001
From: SBrandeis <simon@huggingface.co>
Date: Fri, 26 Jan 2024 17:28:13 +0100
Subject: [PATCH 51/51] oops I forgot to commit the new file after rename

---
 .../tasks/src/tasks/common-definitions.json   | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 packages/tasks/src/tasks/common-definitions.json

diff --git a/packages/tasks/src/tasks/common-definitions.json b/packages/tasks/src/tasks/common-definitions.json
new file mode 100644
index 0000000000..6e0ec532d4
--- /dev/null
+++ b/packages/tasks/src/tasks/common-definitions.json
@@ -0,0 +1,109 @@
+{
+	"$id": "/inference/schemas/common-definitions.json",
+	"$schema": "http://json-schema.org/draft-06/schema#",
+	"description": "(Incomplete!) Common type definitions shared by several tasks",
+	"definitions": {
+		"ClassificationOutputTransform": {
+			"title": "ClassificationOutputTransform",
+			"type": "string",
+			"description": "The function to apply to the model outputs in order to retrieve the scores.",
+			"oneOf": [
+				{
+					"const": "sigmoid"
+				},
+				{
+					"const": "softmax"
+				},
+				{
+					"const": "none"
+				}
+			]
+		},
+		"ClassificationOutput": {
+			"title": "ClassificationOutput",
+			"type": "object",
+			"properties": {
+				"label": {
+					"type": "string",
+					"description": "The predicted class label."
+				},
+				"score": {
+					"type": "number",
+					"description": "The corresponding probability."
+				}
+			},
+			"required": ["label", "score"]
+		},
+		"GenerationParameters": {
+			"title": "GenerationParameters",
+			"description": "Ad-hoc parametrization of the text generation process",
+			"type": "object",
+			"properties": {
+				"temperature": {
+					"type": "number",
+					"description": "The value used to modulate the next token probabilities."
+				},
+				"topK": {
+					"type": "integer",
+					"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+				},
+				"topP": {
+					"type": "number",
+					"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
+				},
+				"typicalP": {
+					"type": "number",
+					"description": "Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
+				},
+				"epsilonCutoff": {
+					"type": "number",
+					"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"etaCutoff": {
+					"type": "number",
+					"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
+				},
+				"maxLength": {
+					"type": "integer",
+					"description": "The maximum length (in tokens) of the generated text, including the input."
+				},
+				"maxNewTokens": {
+					"type": "integer",
+					"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
+				},
+				"minLength": {
+					"type": "integer",
+					"description": "The minimum length (in tokens) of the generated text, including the input."
+				},
+				"minNewTokens": {
+					"type": "integer",
+					"description": "The minimum number of tokens to generate. Takes precedence over minLength."
+				},
+				"doSample": {
+					"type": "boolean",
+					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
+				},
+				"earlyStopping": {
+					"description": "Controls the stopping condition for beam-based methods.",
+					"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
+				},
+				"numBeams": {
+					"type": "integer",
+					"description": "Number of beams to use for beam search."
+				},
+				"numBeamGroups": {
+					"type": "integer",
+					"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
+				},
+				"penaltyAlpha": {
+					"type": "number",
+					"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
+				},
+				"useCache": {
+					"type": "boolean",
+					"description": "Whether the model should reuse the past key/value attention states to speed up decoding"
+				}
+			}
+		}
+	}
+}