fiftyone.utils.huggingface

module documentation

(source)

Utilities for working with Hugging Face.

Class	`HFHubDatasetConfig`	Config for a Hugging Face Hub dataset.
Class	`HFHubParquetFilesDatasetConfig`	Config for a Hugging Face Hub dataset that is stored as parquet files.
Class	`MediaFieldConverter`	Undocumented
Function	`list_hub_datasets`	Lists all FiftyOne datasets available on the Hugging Face Hub.
Function	`load_from_hub`	Loads a dataset from the Hugging Face Hub into FiftyOne.
Function	`push_to_hub`	Push a FiftyOne dataset to the Hugging Face Hub.
Constant	`DATASET_CONTENT_TEMPLATE`	Undocumented
Constant	`DATASET_METADATA_FILENAMES`	Undocumented
Constant	`DATASETS_MAX_BATCH_SIZE`	Undocumented
Constant	`DATASETS_SERVER_URL`	Undocumented
Constant	`DEFAULT_BATCH_SIZE`	Undocumented
Constant	`DEFAULT_IMAGE_FILEPATH_FEATURE`	Undocumented
Constant	`DEFAULT_MEDIA_TYPE`	Undocumented
Constant	`FIFTYONE_BUILTIN_FIELDS`	Undocumented
Constant	`SUPPORTED_DTYPES`	Undocumented
Variable	`hfh`	Undocumented
Variable	`logger`	Undocumented
Function	`_add_dataset_metadata`	Undocumented
Function	`_add_parquet_subset_to_dataset`	Undocumented
Function	`_build_config`	Undocumented
Function	`_build_dtype_field_converter`	Undocumented
Function	`_build_label_field_converter`	Undocumented
Function	`_build_media_field_converter`	Undocumented
Function	`_build_parquet_to_fiftyone_conversion`	Undocumented
Function	`_build_rows_request_url`	Undocumented
Function	`_configure_dataset_media_fields`	Undocumented
Function	`_convert_bounding_box`	Undocumented
Function	`_count_samples`	Undocumented
Function	`_create_dataset_card`	Undocumented
Function	`_download_files_in_batches`	Undocumented
Function	`_download_image`	Undocumented
Function	`_download_images`	Undocumented
Function	`_ensure_dataset_compatibility`	Undocumented
Function	`_extract_number`	Undocumented
Function	`_generate_dataset_summary`	Undocumented
Function	`_get_allowed_splits`	Undocumented
Function	`_get_allowed_subsets`	Undocumented
Function	`_get_bounding_box_field_name`	Undocumented
Function	`_get_dataset_metadata`	Undocumented
Function	`_get_dataset_tags`	Undocumented
Function	`_get_dataset_tasks`	Undocumented
Function	`_get_detection_label_field_name`	Undocumented
Function	`_get_download_dir`	Undocumented
Function	`_get_files_to_download`	Undocumented
Function	`_get_headers`	Undocumented
Function	`_get_image_shape`	Undocumented
Function	`_get_label_field_names_and_types`	Undocumented
Function	`_get_media_fields`	Undocumented
Function	`_get_num_rows`	Undocumented
Function	`_get_parquet_dataset_features`	Undocumented
Function	`_get_rows`	Undocumented
Function	`_get_size_category`	Undocumented
Function	`_get_split_subset_pairs`	Undocumented
Function	`_is_already_uploaded`	Undocumented
Function	`_is_valid_split_subset_pair`	Undocumented
Function	`_load_dataset_from_config`	Undocumented
Function	`_load_fiftyone_dataset_from_config`	Undocumented
Function	`_load_parquet_files_dataset_from_config`	Undocumented
Function	`_no_progress_bars`	Undocumented
Function	`_parse_format_string`	Undocumented
Function	`_parse_split_kwargs`	Undocumented
Function	`_parse_subset_kwargs`	Undocumented
Function	`_populate_config_file`	Undocumented
Function	`_resolve_dataset_name`	Undocumented
Function	`_upload_data_to_repo`	Undocumented

def list_hub_datasets(info=False): (source) ¶

Lists all FiftyOne datasets available on the Hugging Face Hub.

This method includes all datasets that are tagged with the FiftyOne library on the Hugging Face Hub.

Examples:

from fiftyone.utils.huggingface import list_hub_datasets

datasets = list_hub_datasets()
print(datasets)

Parameters
info:`False`	whether to return dataset names (False) or `huggingface_hub.hf_api.DatasetInfo` objects (True)
Returns
a list of dataset names or objects

def load_from_hub(repo_id, revision=None, split=None, splits=None, subset=None, subsets=None, max_samples=None, batch_size=None, num_workers=None, overwrite=False, persistent=False, name=None, token=None, config_file=None, **kwargs): (source) ¶

Loads a dataset from the Hugging Face Hub into FiftyOne.

Parameters
repo_id	the Hugging Face Hub identifier of the dataset
revision:`None`	the revision of the dataset to load
split:`None`	the split of the dataset to load
splits:`None`	the splits of the dataset to load
subset:`None`	the subset of the dataset to load
subsets:`None`	the subsets of the dataset to load
max_samples:`None`	the maximum number of samples to load
batch_size:`None`	the batch size to use when loading samples
num_workers:`None`	a suggested number of threads to use when downloading media
overwrite:`False`	whether to overwrite an existing dataset with the same name
persistent:`False`	whether the dataset should be persistent
name:`None`	an optional name to give the dataset
token:`None`	a Hugging Face API token to use. May also be provided via the `HF_TOKEN` environment variable
config_file:`None`	the path to a config file on disk specifying how to load the dataset if the repo has no `fiftyone.yml` file
**kwargs	keyword arguments specifying config parameters to load the dataset if the repo has no `fiftyone.yml` file
Returns
a `fiftyone.core.dataset.Dataset`

def push_to_hub(dataset, repo_name, description=None, license=None, tags=None, private=False, exist_ok=False, dataset_type=None, min_fiftyone_version=None, label_field=None, frame_labels_field=None, token=None, preview_path=None, chunk_size=None, **data_card_kwargs): (source) ¶

Push a FiftyOne dataset to the Hugging Face Hub.

Parameters
dataset	a FiftyOne dataset
repo_name	the name of the dataset repo to create. The repo ID will be `{your_username}/{repo_name}`
description:`None`	a description of the dataset
license:`None`	the license of the dataset
tags:`None`	a list of tags for the dataset
private:`False`	whether the repo should be private
exist_ok:`False`	if True, do not raise an error if the repo already exists
dataset_type:`None`	the type of the dataset to create
min_fiftyone_version:`None`	the minimum version of FiftyOne required to load the dataset. For example `"0.23.0"`.
label_field:`None`	controls the label field(s) to export. Only applicable to labeled datasets. Can be any of the following: the name of a label field to export; a glob pattern of label field(s) to export; a list or tuple of label field(s) to export; or a dictionary mapping label field names to keys to use when constructing the label dictionaries to pass to the exporter
frame_labels_field:`None`	controls the frame label field(s) to export. The "frames." prefix is optional. Only applicable to labeled video datasets. Can be any of the following: the name of a frame label field to export; a glob pattern of frame label field(s) to export; a list or tuple of frame label field(s) to export; or a dictionary mapping frame label field names to keys to use when constructing the frame label dictionaries to pass to the exporter
token:`None`	a Hugging Face API token to use. May also be provided via the `HF_TOKEN` environment variable
preview_path:`None`	a path to a preview image or video to display on the readme of the dataset repo.
chunk_size:`None`	the number of media files to put in each subdirectory, to avoid having too many files in a single directory. If None, no chunking is performed. If the dataset has more than 10,000 samples, it will be chunked by default to avoid exceeding the maximum number of files in a directory on Hugging Face Hub. This parameter is only applicable to `fiftyone.types.dataset_types.FiftyOneDataset` datasets.
**data_card_kwargs	additional keyword arguments to pass to the `DatasetCard` constructor

DATASET_CONTENT_TEMPLATE: str = (source) ¶

Undocumented

Value

'''

{preview}

This is a [FiftyOne](https://github.com/voxel51/fiftyone) dataset with {num_samples} samples.

...

DATASET_METADATA_FILENAMES: tuple[str, ...] = (source) ¶

Undocumented

Value

('fiftyone.yml', 'fiftyone.yaml')

DATASETS_MAX_BATCH_SIZE: int = (source) ¶

Undocumented

Value

DATASETS_SERVER_URL: str = (source) ¶

Undocumented

Value

'https://datasets-server.huggingface.co'

DEFAULT_BATCH_SIZE: int = (source) ¶

Undocumented

Value

DEFAULT_IMAGE_FILEPATH_FEATURE: str = (source) ¶

Undocumented

Value

'image'

DEFAULT_MEDIA_TYPE: str = (source) ¶

Undocumented

Value

'image'

FIFTYONE_BUILTIN_FIELDS: tuple[str, ...] = (source) ¶

Undocumented

Value

('id', 'filepath', 'tags', 'metadata')

SUPPORTED_DTYPES: tuple[str, ...] = (source) ¶

Undocumented

Value

('int8',
 'int16',
 'int32',
 'int64',
 'float16',
 'float32',
 'float64',
...

hfh = (source) ¶

Undocumented

logger = (source) ¶

Undocumented

def _add_dataset_metadata(dataset, config): (source) ¶

Undocumented

def _add_parquet_subset_to_dataset(dataset, config, split, subset, **kwargs): (source) ¶

Undocumented

def _build_config(config_dict): (source) ¶

Undocumented

def _build_dtype_field_converter(field_name, feature, config): (source) ¶

Undocumented

def _build_label_field_converter(field_name, field_type, feature, download_dir): (source) ¶

Undocumented

def _build_media_field_converter(media_field_key, media_field_name, feature, download_dir): (source) ¶

Undocumented

def _build_parquet_to_fiftyone_conversion(config, split, subset, **kwargs): (source) ¶

Undocumented

def _build_rows_request_url(repo_id, split=None, subset='default', revision=None, offset=0, length=100): (source) ¶

Undocumented

def _configure_dataset_media_fields(dataset, config): (source) ¶

Undocumented

def _convert_bounding_box(hf_bbox, img_size): (source) ¶

Undocumented

def _count_samples(sample_collection): (source) ¶

Undocumented

def _create_dataset_card(repo_id, dataset, tags=None, license=None, preview_path=None, **dataset_card_kwargs): (source) ¶

Undocumented

def _download_files_in_batches(filepaths, download_dir, batch_size, **init_download_kwargs): (source) ¶

Undocumented

def _download_image(url_and_filepath): (source) ¶

Undocumented

def _download_images(urls_and_filepaths, num_workers): (source) ¶

Undocumented

def _ensure_dataset_compatibility(config): (source) ¶

Undocumented

def _extract_number(filename): (source) ¶

Undocumented

def _generate_dataset_summary(repo_id, dataset, preview_path): (source) ¶

Undocumented

def _get_allowed_splits(config, **kwargs): (source) ¶

Undocumented

def _get_allowed_subsets(config, **kwargs): (source) ¶

Undocumented

def _get_bounding_box_field_name(feature): (source) ¶

Undocumented

def _get_dataset_metadata(repo_id, revision=None, token=None, **kwargs): (source) ¶

Undocumented

def _get_dataset_tags(dataset): (source) ¶

Undocumented

def _get_dataset_tasks(dataset): (source) ¶

Undocumented

def _get_detection_label_field_name(feature): (source) ¶

Undocumented

def _get_download_dir(repo_id, split=None, subset=None, **kwargs): (source) ¶

Undocumented

def _get_files_to_download(sample_collection): (source) ¶

Undocumented

def _get_headers(**kwargs): (source) ¶

Undocumented

def _get_image_shape(image_path): (source) ¶

Undocumented

def _get_label_field_names_and_types(config): (source) ¶

Undocumented

def _get_media_fields(sample_collection): (source) ¶

Undocumented

def _get_num_rows(repo_id, split, subset, revision=None, **kwargs): (source) ¶

Undocumented

def _get_parquet_dataset_features(repo_id, split, subset, revision=None, **kwargs): (source) ¶

Undocumented

def _get_rows(repo_id, split, subset, start_index=0, end_index=100, revision=None, **kwargs): (source) ¶

Undocumented

def _get_size_category(num_samples): (source) ¶

Undocumented

def _get_split_subset_pairs(config, **kwargs): (source) ¶

Undocumented

def _is_already_uploaded(api, repo_id, folder_path): (source) ¶

Undocumented

def _is_valid_split_subset_pair(split, subset, allowed_splits, allowed_subsets): (source) ¶

Undocumented

def _load_dataset_from_config(config, **kwargs): (source) ¶

Undocumented

def _load_fiftyone_dataset_from_config(config, **kwargs): (source) ¶

Undocumented

def _load_parquet_files_dataset_from_config(config, **kwargs): (source) ¶

Undocumented

@contextmanager
def _no_progress_bars(): (source) ¶

Undocumented

def _parse_format_string(format_str): (source) ¶

Undocumented

def _parse_split_kwargs(**kwargs): (source) ¶

Undocumented

def _parse_subset_kwargs(**kwargs): (source) ¶

Undocumented

def _populate_config_file(config_filepath, dataset, dataset_type=None, description=None, license=None, tags=None, min_fiftyone_version=None): (source) ¶

Undocumented

def _resolve_dataset_name(config, **kwargs): (source) ¶

Undocumented

def _upload_data_to_repo(api, repo_id, tmp_dir, dataset_type): (source) ¶

Undocumented