3 changes: 2 additions & 1 deletion CONTRIBUTING.md
@@ -29,7 +29,7 @@ You can always update the name later. If you want to cancel the prompt, select
1. **Write the prompt**. In the box labeled "Template," enter a Jinja expression.
See the [getting started guide](#getting-started-using-jinja-to-write-prompts)
and [cookbook](#jinja-cookbook) for details on how to write templates.
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, and answer choices.
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, metrics, languages, and answer choices.
See [Metadata](#metadata) for more details about these fields.
1. **Save the prompt**. Hit the "Save" button. The output of the prompt
applied to the current example will appear in the right sidebar.
@@ -124,6 +124,7 @@ to generate a question for a given answer would not.
the options for the possible outputs (regardless of whether `answer_choices` is used).
* **Metrics.** Use the multiselect widget to select all metrics commonly used to evaluate
this task. Choose “Other” if there is one that is not included in the list.
* **Languages.** Use the multiselect widget to select all languages used in the prompt. This is independent of what languages are used in the underlying dataset. For example, you could have an English prompt for a Spanish dataset.
* **Answer Choices.** If the prompt has a small set of possible outputs (e.g., Yes/No,
class labels, entailment judgements, etc.), then the prompt should define and use answer
choices as follows. This allows evaluation to consider just the possible targets for
Binary file modified assets/promptsource_app.png
51 changes: 28 additions & 23 deletions promptsource/app.py
@@ -17,7 +17,7 @@

from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME
from promptsource.session import _get_state
from promptsource.templates import INCLUDED_USERS, DatasetTemplates, Template, TemplateCollection
from promptsource.templates import INCLUDED_USERS, LANGUAGES, METRICS, DatasetTemplates, Template, TemplateCollection
from promptsource.utils import (
    get_dataset,
    get_dataset_confs,
@@ -57,6 +57,17 @@ def get_infos(all_infos, d_name):
    all_infos[d_name] = infos_dict


def format_language(tag):
    """
    Formats a language tag for display in the UI.

    For example, if the tag is "en", then the function returns "en (English)"
    :param tag: language tag
    :return: formatted language name
    """
    return tag + " (" + LANGUAGES[tag] + ")"


# add an argument for read-only
# At the moment, streamlit does not handle python script arguments gracefully.
# Thus, for read-only mode, you have to type one of the below two:
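For illustration, the new format_language helper simply pairs an ISO 639-1 tag with its English name from the LANGUAGES table added in templates.py below; a doctest-style sketch (outputs assume that table):

    >>> format_language("en")
    'en (English)'
    >>> format_language("es")
    'es (Spanish, Castilian)'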
@@ -421,6 +432,11 @@ def show_text(t, width=WIDTH, with_markdown=False):
st.text(template.metadata.choices_in_prompt)
st.markdown("##### Metrics")
st.text(", ".join(template.metadata.metrics) if template.metadata.metrics else None)
st.markdown("##### Prompt Languages")
if template.metadata.languages:
    st.text(", ".join([format_language(tag) for tag in template.metadata.languages]))
else:
    st.text(None)
st.markdown("##### Answer Choices")
if template.get_answer_choices_expr() is not None:
    show_jinja(template.get_answer_choices_expr())
@@ -559,35 +575,24 @@ def show_text(t, width=WIDTH, with_markdown=False):
    help="Prompt explicitly lists choices in the template for the output.",
)

# Metrics from here:
# https://github.com/google-research/text-to-text-transfer-transformer/blob/4b580f23968c2139be7fb1cd53b22c7a7f686cdf/t5/evaluation/metrics.py
metrics_choices = [
    "BLEU",
    "ROUGE",
    "Squad",
    "Trivia QA",
    "Accuracy",
    "Pearson Correlation",
    "Spearman Correlation",
    "MultiRC",
    "AUC",
    "COQA F1",
    "Edit Distance",
]
# Add mean reciprocal rank
metrics_choices.append("Mean Reciprocal Rank")
# Add generic other
metrics_choices.append("Other")
# Sort alphabetically
metrics_choices = sorted(metrics_choices)
state.metadata.metrics = st.multiselect(
    "Metrics",
    sorted(METRICS),
    default=template.metadata.metrics,
    help="Select all metrics that are commonly used (or should "
    "be used if a new task) to evaluate this prompt.",
)

state.metadata.languages = st.multiselect(
    "Prompt Languages",
    sorted(LANGUAGES.keys()),
    default=template.metadata.languages,
    format_func=format_language,
    help="Select all languages used in this prompt. "
    "This annotation is independent from the language(s) "
    "of the dataset.",
)

# Answer choices
if template.get_answer_choices_expr() is not None:
    answer_choices = template.get_answer_choices_expr()
211 changes: 211 additions & 0 deletions promptsource/templates.py
@@ -29,6 +29,212 @@

INCLUDED_USERS = {"Zaid", "craffel", "GEM", "aps", "khalidalt", "shanya", "rbawden", "BigScienceBiasEval", "gsarti"}

# These are the metrics with which templates can be tagged
METRICS = {
    "BLEU",
    "ROUGE",
    "Squad",
    "Trivia QA",
    "Accuracy",
    "Pearson Correlation",
    "Spearman Correlation",
    "MultiRC",
    "AUC",
    "COQA F1",
    "Edit Distance",
    "Mean Reciprocal Rank",
    "Other",
}

# These are the languages with which templates can be tagged. Keys are ISO 639-1
# tags, which are the actual tags we use. Values are English names shown in the
# UI for convenience.
LANGUAGES = {
    "ab": "Abkhazian",
    "aa": "Afar",
    "af": "Afrikaans",
    "ak": "Akan",
    "sq": "Albanian",
    "am": "Amharic",
    "ar": "Arabic",
    "an": "Aragonese",
    "hy": "Armenian",
    "as": "Assamese",
    "av": "Avaric",
    "ae": "Avestan",
    "ay": "Aymara",
    "az": "Azerbaijani",
    "bm": "Bambara",
    "ba": "Bashkir",
    "eu": "Basque",
    "be": "Belarusian",
    "bn": "Bengali",
    "bi": "Bislama",
    "bs": "Bosnian",
    "br": "Breton",
    "bg": "Bulgarian",
    "my": "Burmese",
    "ca": "Catalan, Valencian",
    "ch": "Chamorro",
    "ce": "Chechen",
    "ny": "Chichewa, Chewa, Nyanja",
    "zh": "Chinese",
    "cu": "Church Slavic, Old Slavonic, Church Slavonic, Old Bulgarian, Old Church Slavonic",
    "cv": "Chuvash",
    "kw": "Cornish",
    "co": "Corsican",
    "cr": "Cree",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "dv": "Divehi, Dhivehi, Maldivian",
    "nl": "Dutch, Flemish",
    "dz": "Dzongkha",
    "en": "English",
    "eo": "Esperanto",
    "et": "Estonian",
    "ee": "Ewe",
    "fo": "Faroese",
    "fj": "Fijian",
    "fi": "Finnish",
    "fr": "French",
    "fy": "Western Frisian",
    "ff": "Fulah",
    "gd": "Gaelic, Scottish Gaelic",
    "gl": "Galician",
    "lg": "Ganda",
    "ka": "Georgian",
    "de": "German",
    "el": "Greek, Modern (1453–)",
    "kl": "Kalaallisut, Greenlandic",
    "gn": "Guarani",
    "gu": "Gujarati",
    "ht": "Haitian, Haitian Creole",
    "ha": "Hausa",
    "he": "Hebrew",
    "hz": "Herero",
    "hi": "Hindi",
    "ho": "Hiri Motu",
    "hu": "Hungarian",
    "is": "Icelandic",
    "io": "Ido",
    "ig": "Igbo",
    "id": "Indonesian",
    "ia": "Interlingua (International Auxiliary Language Association)",
    "ie": "Interlingue, Occidental",
    "iu": "Inuktitut",
    "ik": "Inupiaq",
    "ga": "Irish",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "kn": "Kannada",
    "kr": "Kanuri",
    "ks": "Kashmiri",
    "kk": "Kazakh",
    "km": "Central Khmer",
    "ki": "Kikuyu, Gikuyu",
    "rw": "Kinyarwanda",
    "ky": "Kirghiz, Kyrgyz",
    "kv": "Komi",
    "kg": "Kongo",
    "ko": "Korean",
    "kj": "Kuanyama, Kwanyama",
    "ku": "Kurdish",
    "lo": "Lao",
    "la": "Latin",
    "lv": "Latvian",
    "li": "Limburgan, Limburger, Limburgish",
    "ln": "Lingala",
    "lt": "Lithuanian",
    "lu": "Luba-Katanga",
    "lb": "Luxembourgish, Letzeburgesch",
    "mk": "Macedonian",
    "mg": "Malagasy",
    "ms": "Malay",
    "ml": "Malayalam",
    "mt": "Maltese",
    "gv": "Manx",
    "mi": "Maori",
    "mr": "Marathi",
    "mh": "Marshallese",
    "mn": "Mongolian",
    "na": "Nauru",
    "nv": "Navajo, Navaho",
    "nd": "North Ndebele",
    "nr": "South Ndebele",
    "ng": "Ndonga",
    "ne": "Nepali",
    "no": "Norwegian",
    "nb": "Norwegian Bokmål",
    "nn": "Norwegian Nynorsk",
    "ii": "Sichuan Yi, Nuosu",
    "oc": "Occitan",
    "oj": "Ojibwa",
    "or": "Oriya",
    "om": "Oromo",
    "os": "Ossetian, Ossetic",
    "pi": "Pali",
    "ps": "Pashto, Pushto",
    "fa": "Persian",
    "pl": "Polish",
    "pt": "Portuguese",
    "pa": "Punjabi, Panjabi",
    "qu": "Quechua",
    "ro": "Romanian, Moldavian, Moldovan",
    "rm": "Romansh",
    "rn": "Rundi",
    "ru": "Russian",
    "se": "Northern Sami",
    "sm": "Samoan",
    "sg": "Sango",
    "sa": "Sanskrit",
    "sc": "Sardinian",
    "sr": "Serbian",
    "sn": "Shona",
    "sd": "Sindhi",
    "si": "Sinhala, Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "st": "Southern Sotho",
    "es": "Spanish, Castilian",
    "su": "Sundanese",
    "sw": "Swahili",
    "ss": "Swati",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ty": "Tahitian",
    "tg": "Tajik",
    "ta": "Tamil",
    "tt": "Tatar",
    "te": "Telugu",
    "th": "Thai",
    "bo": "Tibetan",
    "ti": "Tigrinya",
    "to": "Tonga (Tonga Islands)",
    "ts": "Tsonga",
    "tn": "Tswana",
    "tr": "Turkish",
    "tk": "Turkmen",
    "tw": "Twi",
    "ug": "Uighur, Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "cy": "Welsh",
    "wo": "Wolof",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "za": "Zhuang, Chuang",
    "zu": "Zulu",
}


def highlight(input):
return "<span style='color: #F08080'>" + input + "</span>"
@@ -229,6 +435,7 @@ def __init__(
    original_task: Optional[bool] = None,
    choices_in_prompt: Optional[bool] = None,
    metrics: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
):
    """
    Initializes template metadata.
@@ -242,10 +449,12 @@ def __init__(
    :param choices_in_prompt: If True, the answer choices are included in the templates such that models
        see those choices in the input. Only applicable to classification tasks.
    :param metrics: List of strings denoting metrics to use for evaluation
    :param languages: List of strings denoting languages used in the prompt (not the associated dataset!)
    """
    self.original_task = original_task
    self.choices_in_prompt = choices_in_prompt
    self.metrics = metrics
    self.languages = languages


class TemplateCollection:
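For context, constructing metadata with the new field might look like this (a sketch assuming Metadata is the nested Template class whose __init__ is shown above; values are illustrative):

    metadata = Template.Metadata(
        original_task=True,
        choices_in_prompt=False,
        metrics=["Accuracy"],
        languages=["en"],  # language(s) of the prompt, independent of the dataset
    )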
@@ -505,6 +714,7 @@ def get_templates_data_frame():
    "original_task": [],
    "choices_in_prompt": [],
    "metrics": [],
    "languages": [],
    "answer_choices": [],
    "jinja": [],
}
@@ -523,6 +733,7 @@ def get_templates_data_frame():
data["original_task"].append(template.metadata.original_task)
data["choices_in_prompt"].append(template.metadata.choices_in_prompt)
data["metrics"].append(template.metadata.metrics)
data["languages"].append(template.metadata.languages)
data["answer_choices"].append(template.get_answer_choices_expr())
data["jinja"].append(template.jinja)
