NOTE(review): line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch.
The base indentation of the app.py hunk is an assumption -- confirm against the target file before applying.
The `index` blob hashes predate the review fixes below and no longer describe the post-image.

diff --git a/promptsource/app.py b/promptsource/app.py
index cc7a31d52..8ca9d8e1d 100644
--- a/promptsource/app.py
+++ b/promptsource/app.py
@@ -313,13 +313,19 @@ def show_text(t, width=WIDTH, with_markdown=False):
         except OSError as e:
             st.error(
                 f"Some datasets are not handled automatically by `datasets` and require users to download the "
-                f"dataset manually. This applies to {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
-                f"\n\nPlease download the raw dataset to `~/.cache/promptsource/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`. "
+                f"dataset manually. It is possibly the case for {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
+                f"\n\nIf so, please download the raw dataset to `~/.cache/promptsource/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`. "
                 f"\n\nYou can choose another cache directory by overriding `PROMPTSOURCE_MANUAL_DATASET_DIR` environment "
                 f"variable and downloading raw dataset to `$PROMPTSOURCE_MANUAL_DATASET_DIR/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`"
                 f"\n\nOriginal error:\n{str(e)}"
             )
             st.stop()
+        except Exception as e:
+            st.error(
+                f"An error occurred while loading the dataset {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
+                f"\n\nOriginal error:\n{str(e)}"
+            )
+            st.stop()
 
         splits = list(dataset.keys())
         index = 0
diff --git a/promptsource/utils.py b/promptsource/utils.py
index ce57a2ac1..c8ae32bde 100644
--- a/promptsource/utils.py
+++ b/promptsource/utils.py
@@ -46,33 +46,20 @@ def get_dataset_builder(path, conf=None):
 
 def get_dataset(path, conf=None):
     "Get a dataset from name and conf."
-    builder_instance = get_dataset_builder(path, conf)
-    if builder_instance.manual_download_instructions is None and builder_instance.info.size_in_bytes is not None:
-        builder_instance.download_and_prepare()
-        return builder_instance.as_dataset()
-    else:
-        return load_dataset(path, conf)
-
-
-def load_dataset(dataset_name, subset_name):
     try:
-        return datasets.load_dataset(dataset_name, subset_name)
+        return datasets.load_dataset(path, conf)
     except datasets.builder.ManualDownloadError:
         cache_root_dir = (
             os.environ["PROMPTSOURCE_MANUAL_DATASET_DIR"]
             if "PROMPTSOURCE_MANUAL_DATASET_DIR" in os.environ
             else DEFAULT_PROMPTSOURCE_CACHE_HOME
         )
-        data_dir = (
-            f"{cache_root_dir}/{dataset_name}"
-            if subset_name is None
-            else f"{cache_root_dir}/{dataset_name}/{subset_name}"
-        )
+        data_dir = f"{cache_root_dir}/{path}" if conf is None else f"{cache_root_dir}/{path}/{conf}"
         return datasets.load_dataset(
-            dataset_name,
-            subset_name,
+            path,
+            conf,
             data_dir=data_dir,
         )
 
 
 def get_dataset_confs(path):