diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index 9256f2381484..e38eeb20088b 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -7,11 +7,10 @@
 from ..models.bert.tokenization_bert import BasicTokenizer
 from ..utils import (
     ExplicitEnum,
-    add_end_docstrings,
     is_tf_available,
     is_torch_available,
 )
-from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args
+from .base import ArgumentHandler, ChunkPipeline, Dataset
 
 
 if is_tf_available():
@@ -60,40 +59,6 @@ class AggregationStrategy(ExplicitEnum):
     MAX = "max"
 
 
-@add_end_docstrings(
-    build_pipeline_init_args(has_tokenizer=True),
-    r"""
-        ignore_labels (`List[str]`, defaults to `["O"]`):
-            A list of labels to ignore.
-        grouped_entities (`bool`, *optional*, defaults to `False`):
-            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
-            same entity together in the predictions or not.
-        stride (`int`, *optional*):
-            If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
-            model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
-            value of this argument defines the number of overlapping tokens between chunks. In other words, the model
-            will shift forward by `tokenizer.model_max_length - stride` tokens each step.
-        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
-            The strategy to fuse (or not) tokens based on the model prediction.
-
-                - "none" : Will simply not do any aggregation and simply return raw results from the model
-                - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
-                  I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
-                  "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
-                  different entities. On word based languages, we might end up splitting words undesirably : Imagine
-                  Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
-                  "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
-                  that support that meaning, which is basically tokens separated by a space). These mitigations will
-                  only work on real words, "New york" might still be tagged with two different entities.
-                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
-                  end up with different tags. Words will simply use the tag of the first token of the word when there
-                  is ambiguity.
-                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
-                  cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
-                  label is applied.
-                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
-                  end up with different tags. Word entity will simply be the token with the maximum score.""",
-)
 class TokenClassificationPipeline(ChunkPipeline):
     """
     Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
@@ -224,6 +189,33 @@ def __call__(self, inputs: Union[str, List[str]], **kwargs):
         Args:
             inputs (`str` or `List[str]`):
                 One or several texts (or one list of texts) for token classification.
+            ignore_labels (`List[str]`, defaults to `["O"]`):
+                A list of labels to ignore.
+            stride (`int`, *optional*):
+                If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
+                `model_max_length`. Works only with fast tokenizers and an `aggregation_strategy` different from
+                `NONE`. The value of this argument defines the number of overlapping tokens between chunks. In other
+                words, the model will shift forward by `tokenizer.model_max_length - stride` tokens each step. For
+                example, with `model_max_length=512` and `stride=128`, successive chunks overlap by 128 tokens and
+                advance 384 tokens at a time.
+            aggregation_strategy (`str`, *optional*, defaults to `"none"`):
+                The strategy to fuse (or not) tokens based on the model prediction.
+
+                    - "none" : Will not do any aggregation and will simply return the raw results from the model.
+                    - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG),
+                      (C, I-TAG), (D, B-TAG2), (E, B-TAG2) will end up as [{"word": "ABC", "entity": "TAG"}, {"word":
+                      "D", "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}]. Notice that two consecutive B tags end
+                      up as different entities. In word-based languages, we might end up splitting words undesirably:
+                      imagine Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft",
+                      "entity": "NAME"}]. Look at FIRST, MAX, and AVERAGE for ways to mitigate this and disambiguate
+                      words (on languages that support that meaning, which is basically tokens separated by spaces).
+                      These mitigations will only work on real words; "New york" might still be tagged with two
+                      different entities.
+                    - "first" : (works only on word-based models) Will use the `SIMPLE` strategy, except that words
+                      cannot end up with different tags. Words will simply use the tag of the first token of the word
+                      when there is ambiguity.
+                    - "average" : (works only on word-based models) Will use the `SIMPLE` strategy, except that words
+                      cannot end up with different tags. Scores will be averaged first across tokens, and then the
+                      maximum label is applied.
+                    - "max" : (works only on word-based models) Will use the `SIMPLE` strategy, except that words
+                      cannot end up with different tags. The word entity will simply be the token with the maximum
+                      score.
 
         Return:
             A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
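For context on the parameters documented in the hunk above, a minimal usage sketch (the `dslim/bert-base-NER` checkpoint, the sample sentence, and the abbreviated output are illustrative, not taken from this diff):

from transformers import pipeline

# Any token-classification checkpoint works; "dslim/bert-base-NER" is only an example.
token_classifier = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",  # fuse B-/I- subword tokens into whole entities
    stride=128,  # token overlap between chunks when text exceeds model_max_length (fast tokenizers only)
)

print(token_classifier("My name is Wolfgang and I live in Berlin."))
# With "simple" aggregation, subword pieces are merged back into words, roughly:
# [{"entity_group": "PER", "word": "Wolfgang", ...}, {"entity_group": "LOC", "word": "Berlin", ...}]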
diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py
index 5e4b18f91699..2ee998b24ffb 100644
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -15,6 +15,7 @@
 import unittest
 
 import numpy as np
+from huggingface_hub import TokenClassificationOutputElement
 
 from transformers import (
     MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
@@ -26,6 +27,7 @@
 )
 from transformers.pipelines import AggregationStrategy, TokenClassificationArgumentHandler
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     is_torch_available,
     nested_simplify,
@@ -103,6 +105,9 @@ def run_pipeline_test(self, token_classifier, _):
                 for i in range(n)
             ],
         )
+        for output_element in nested_simplify(outputs):
+            compare_pipeline_output_to_hub_spec(output_element, TokenClassificationOutputElement)
+
         outputs = token_classifier(["list of strings", "A simple string that is quite a bit longer"])
         self.assertIsInstance(outputs, list)
         self.assertEqual(len(outputs), 2)
@@ -137,6 +142,9 @@ def run_pipeline_test(self, token_classifier, _):
             ],
         )
 
+        for output_element in nested_simplify(outputs):
+            compare_pipeline_output_to_hub_spec(output_element, TokenClassificationOutputElement)
+
         self.run_aggregation_strategy(model, tokenizer)
 
     def run_aggregation_strategy(self, model, tokenizer):
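`compare_pipeline_output_to_hub_spec` comes from `transformers.testing_utils`; conceptually, it checks each pipeline output dict against the corresponding `huggingface_hub` output dataclass. A rough sketch of that kind of check, assuming the hub specs are plain dataclasses (`check_against_spec` is a hypothetical name, not the helper's real implementation):

import dataclasses

def check_against_spec(output: dict, hub_spec) -> None:
    # Hypothetical stand-in for compare_pipeline_output_to_hub_spec: hub output
    # specs such as TokenClassificationOutputElement are dataclasses, so compare
    # the pipeline's dict keys against the fields the spec declares.
    spec_fields = {field.name for field in dataclasses.fields(hub_spec)}
    unexpected = set(output) - spec_fields
    if unexpected:
        raise AssertionError(f"Keys {unexpected} are not part of {hub_spec.__name__}")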
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index 74bc1b8669a7..a6cacc9ce269 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -34,6 +34,7 @@
     ImageToTextInput,
     ObjectDetectionInput,
     QuestionAnsweringInput,
+    TokenClassificationInput,
     ZeroShotImageClassificationInput,
 )
 
@@ -47,6 +48,7 @@
     ImageToTextPipeline,
     ObjectDetectionPipeline,
     QuestionAnsweringPipeline,
+    TokenClassificationPipeline,
     ZeroShotImageClassificationPipeline,
 )
 from transformers.testing_utils import (
@@ -132,6 +134,7 @@
     "image-to-text": (ImageToTextPipeline, ImageToTextInput),
     "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
     "question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput),
+    "token-classification": (TokenClassificationPipeline, TokenClassificationInput),
     "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
 }
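The new mapping entry pairs the pipeline class with its `huggingface_hub` input spec so the mixin can validate pipeline I/O per task. A hypothetical sketch of consuming such a registry entry (`input_fields_for_task` and the inline registry are illustrative, not part of the test suite):

import dataclasses

from huggingface_hub import TokenClassificationInput

from transformers import TokenClassificationPipeline

# Illustrative registry mirroring the mapping added above.
task_registry = {"token-classification": (TokenClassificationPipeline, TokenClassificationInput)}

def input_fields_for_task(task: str) -> set:
    # Look up the hub input spec for a task and report its declared fields,
    # e.g. to verify that test inputs only use keys the spec defines.
    _, input_spec = task_registry[task]
    return {field.name for field in dataclasses.fields(input_spec)}

print(input_fields_for_task("token-classification"))  # e.g. {"inputs", "parameters"}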