Version of Python¶
In [1]:
!python -V
Python 3.12.6
Import Required Packages¶
In [2]:
# Suppress warnings
import warnings
for warn in [UserWarning, FutureWarning]:
    warnings.filterwarnings("ignore", category = warn)
import io
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision as tv
import torchvision.transforms.functional as F
import pandas as pd
import datasets
import PIL
import jupyterlab as jlab
import ipywidgets
import time
from pathlib import Path
from dataclasses import dataclass
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names, get_dataset_config_names, Dataset, Image, DatasetInfo
from torchvision.transforms import v2
from PIL import Image as PILImage
Versions of Required Libraries¶
In [3]:
packages = [
"Torch", "TorchVision", "NumPy", "Pandas", "Datasets", "Pillow", "JupyterLab", "IPyWidgets"
]
package_objects = [
torch, tv, np, pd, datasets, PIL, jlab, ipywidgets
]
versions = list(map(lambda obj: obj.__version__, package_objects))
pkgs = {"Package": packages, "Version": versions}
df_pkgs = pd.DataFrame(data = pkgs)
df_pkgs.index.name = "#"
df_pkgs.index += 1
display(df_pkgs)
path_to_reqs = "."
reqs_name = "requirements.txt"
def get_packages_and_versions():
"""Generate strings with libraries and their versions in the format: package==version"""
for package, version in zip(packages, versions):
yield f"{package.lower()}=={version}\n"
with open(os.path.join(path_to_reqs, reqs_name), "w", encoding = "utf-8") as f:
f.writelines(get_packages_and_versions())
# | Package | Version
---|---|---
1 | Torch | 2.2.2
2 | TorchVision | 0.17.2
3 | NumPy | 1.26.4
4 | Pandas | 2.2.3
5 | Datasets | 3.0.0
6 | Pillow | 10.4.0
7 | JupyterLab | 4.2.5
8 | IPyWidgets | 8.1.5
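The generated requirements.txt can then be used to recreate the environment, e.g. with pip install -r requirements.txt.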
Downloading a Dataset from the Hugging Face Hub¶
Before downloading a dataset, you can inspect the general information stored in DatasetInfo, including its description, features, and size
Check a Dataset's Attributes without Downloading It¶
In [4]:
cache_dir = os.path.join(".", "data") # Directory for downloading the dataset
# Check the general information required to build the dataset without downloading the dataset itself
ds_builder = load_dataset_builder(path = "uoft-cs/cifar10", cache_dir = cache_dir, trust_remote_code = True)
# Information about the dataset
print(ds_builder.info, "\n")
print(ds_builder.info.features)
DatasetInfo(description='', citation='', homepage='', license='', features={'img': Image(mode=None, decode=True, id=None), 'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='cifar10', config_name='plain_text', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=113861310, num_examples=50000, shard_lengths=None, dataset_name='cifar10'), 'test': SplitInfo(name='test', num_bytes=22774180, num_examples=10000, shard_lengths=None, dataset_name='cifar10')}, download_checksums={'hf://datasets/uoft-cs/cifar10@0b2714987fa478483af9968de7c934580d0bb9a2/plain_text/train-00000-of-00001.parquet': {'num_bytes': 119705255, 'checksum': None}, 'hf://datasets/uoft-cs/cifar10@0b2714987fa478483af9968de7c934580d0bb9a2/plain_text/test-00000-of-00001.parquet': {'num_bytes': 23940850, 'checksum': None}}, download_size=143646105, post_processing_size=None, dataset_size=136635490, size_in_bytes=280281595)

{'img': Image(mode=None, decode=True, id=None), 'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], id=None)}
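Individual fields of DatasetInfo can also be read directly instead of printing the whole object; a minimal sketch using the ds_builder above:

# Print per-split sizes from the SplitInfo objects
for split, info in ds_builder.info.splits.items():
    print(f"{split}: {info.num_examples} examples, {info.num_bytes / 1e6:.1f} MB")
# Class names come from the ClassLabel feature
print(ds_builder.info.features["label"].names)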
List of Configurations Available for the Dataset¶
In [5]:
configs = get_dataset_config_names(path = "uoft-cs/cifar10", trust_remote_code = True)
print(configs)
['plain_text']
Available Data Subsets¶
In [6]:
dataset_split_names = get_dataset_split_names(path = "uoft-cs/cifar10", trust_remote_code = True)
print(dataset_split_names)
['train', 'test']
Downloading a Dataset¶
In [7]:
dataset_train, dataset_test = load_dataset(
path = "uoft-cs/cifar10",
name = "plain_text", # see configs
split = dataset_split_names,
cache_dir = cache_dir,
trust_remote_code = True
)
print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test)
Train subset:
 Dataset({
    features: ['img', 'label'],
    num_rows: 50000
})

Test subset:
 Dataset({
    features: ['img', 'label'],
    num_rows: 10000
})
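Since label is a ClassLabel feature, integer labels can be mapped back to class names; a minimal sketch:

# Map integer labels back to class names via the ClassLabel feature
label_feature = dataset_train.features["label"]
print(label_feature.names) # All class names
print(label_feature.int2str(dataset_train[0]["label"])) # 'airplane' for the first example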
Indexing a Dataset¶
A Dataset contains columns of different data types
In [8]:
# Getting the first row of data
print(dataset_train[0], "\n")
# Getting the last row of data
print(dataset_train[-1], "\n")
# Get a subset of rows of data
print(dataset_train[3:7], "\n")
start_time = time.time()
# Get a slice of rows from a specified column
print(dataset_train[:10]["img"])
end_time = time.time()
print(f"Runtime: {end_time - start_time:.4f} seconds")
{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x135482CC0>, 'label': 0}

{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x135481E20>, 'label': 5}

{'img': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3EFC0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3F230>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FA70>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FE00>], 'label': [2, 7, 2, 1]}

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FEF0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FF50>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FF80>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64080>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64050>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC640B0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64140>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64200>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64170>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC641D0>]
Runtime: 0.0024 seconds
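Access order matters for performance: slicing rows first and then selecting the column, as above, decodes only the requested images, while selecting the whole column first decodes every image in the dataset. A minimal sketch:

fast = dataset_train[:10]["img"] # Decodes only the 10 requested images
# slow = dataset_train["img"][:10] # Would decode all 50,000 images first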
Apply Transformations to Images¶
In [9]:
compose = v2.Compose([
v2.RandomResizedCrop(size = (64, 64), antialias = True),
v2.RandomRotation(degrees = (0, 90))
])
def transforms(examples: dict):
# Apply transformations to all images (img) and save the results (pixel_values)
examples["pixel_values"] = list(map(compose, examples["img"]))
return examples # Result
# Applying the transform function to the training subset
dataset_train.set_transform(transforms)
# Getting a row
print(dataset_train[1])
dataset_train[1]["pixel_values"]
{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC680E0>, 'label': 6, 'pixel_values': <PIL.Image.Image image mode=RGB size=64x64 at 0x13DC68260>}
Out[9]:
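set_transform registers the function on the dataset in place; with_transform is an alternative that returns a new dataset and leaves the original untouched. A minimal sketch:

# with_transform returns a new dataset instead of modifying in place
dataset_train_aug = dataset_train.with_transform(transforms)
print(dataset_train_aug[1]["pixel_values"].size) # (64, 64)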
Creating a Dataset¶
The dataset is partitioned according to the root directory structure, and labels are created automatically from the subdirectory names, following the layout below:

dataset/{train|test}/{class}/{filename}

- {train|test}: training or test subset
- {class}: class name
- {filename}: image or audio file
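For example, the Pokemon dataset used below might be laid out as follows (a hypothetical excerpt):

data/pokemon/
    train/
        abra/0.jpg
        absol/0.jpg
        ...
    test/
        abra/0.jpg
        ...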
Directory-based¶
In [10]:
data_dir = os.path.join(".", "data", "pokemon") # Directory containing the dataset
dataset = load_dataset(
path = "imagefolder",
data_dir = data_dir
)
dataset_train, dataset_test = dataset['train'], dataset['test']
compose = v2.Compose([
v2.Resize(size = (128, 128), antialias = True)
])
def transforms(examples: dict):
# Apply transformations to all images (image) and save the results (pixel_values)
examples["pixel_values"] = list(map(lambda image: F.rotate(compose(image), angle = 45), examples["image"]))
return examples # Result
# Applying the transform function to the training subset
dataset_train.set_transform(transforms)
print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test, "\n")
# Getting the first row of data
print(dataset_train[0])
dataset_train[0]["pixel_values"]
Train subset:
 Dataset({
    features: ['image', 'label'],
    num_rows: 13
})

Test subset:
 Dataset({
    features: ['image', 'label'],
    num_rows: 5
})

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=360x361 at 0x1345CDF40>, 'label': 0, 'pixel_values': <PIL.Image.Image image mode=RGB size=128x128 at 0x1345CF0E0>}
Out[10]:
Dictionary-based¶
In [11]:
data_dir = os.path.join(".", "data", "pokemon") # Directory containing the dataset
# Function to get paths to images
def get_image_paths(data_dir, split):
split_dir = Path(data_dir) / Path(split)
return list(split_dir.rglob("*.jpg"))
# Paths to images
train_images = get_image_paths(data_dir, "train")
test_images = get_image_paths(data_dir, "test")
dataset_train = Dataset.from_dict({"image": list(map(str, train_images))}).cast_column("image", Image())
dataset_test = Dataset.from_dict({"image": list(map(str, test_images))}).cast_column("image", Image())
compose = v2.Compose([
v2.Resize(size = (128, 128), antialias = True)
])
def transforms(examples: dict):
# Apply transformations to all images (image) and save the results (pixel_values)
examples["pixel_values"] = list(map(lambda image: F.rotate(compose(image), angle = -45), examples["image"]))
return examples # Result
# Applying the transform function to the training subset
dataset_train.set_transform(transforms)
print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test, "\n")
# Getting the first row of data
print(dataset_train[0])
dataset_train[0]["pixel_values"]
Train subset:
 Dataset({
    features: ['image'],
    num_rows: 13
})

Test subset:
 Dataset({
    features: ['image'],
    num_rows: 5
})

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=360x325 at 0x13DC8B200>, 'pixel_values': <PIL.Image.Image image mode=RGB size=128x128 at 0x13DC8B140>}
Out[11]:
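When building from a dictionary, labels are not created automatically; a sketch (assuming the train_images paths above) that derives them from the parent directory names:

from datasets import ClassLabel

names = sorted({path.parent.name for path in train_images}) # Class names from directories
labels = [names.index(path.parent.name) for path in train_images] # Integer label per image
dataset_train_labeled = dataset_train.add_column(name = "label", column = labels)
dataset_train_labeled = dataset_train_labeled.cast_column("label", ClassLabel(names = names))
print(dataset_train_labeled.features["label"].names)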
Create/Read General Information Stored in DatasetInfo¶
In [12]:
dataset_info_dir = os.path.join(".", "data", "pokemon") # Directory with dataset
dataset_train.info.write_to_directory(dataset_info_dir = dataset_info_dir)
ds_info = DatasetInfo.from_directory(dataset_info_dir = dataset_info_dir)
print(ds_info)
DatasetInfo(description='', citation='', homepage='', license='', features={'image': Image(mode=None, decode=True, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)
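Fields of DatasetInfo can also be filled in before writing; a minimal sketch (the description text is illustrative):

dataset_train.info.description = "Pokemon images for a small classification demo." # Illustrative text
dataset_train.info.write_to_directory(dataset_info_dir = dataset_info_dir)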
Add a Column¶
In [13]:
column = list(range(1, dataset_train.num_rows + 1))
new_dataset_train = dataset_train.add_column(name = "numbers", column = column)
new_dataset_train
Out[13]:
Dataset({ features: ['image', 'numbers'], num_rows: 13 })
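Related column operations include rename_column and remove_columns; a minimal sketch:

renamed = new_dataset_train.rename_column("numbers", "id") # Rename a column
trimmed = renamed.remove_columns("id") # Drop it again
print(trimmed.column_names) # ['image']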
Add a Row¶
In [14]:
# Array of random pixels
random_image_array = np.random.randint(0, 256, (128, 128, 3), dtype = np.uint8)
# Image created from the array
random_image = PILImage.fromarray(random_image_array)
with io.BytesIO() as output:
random_image.save(output, format = "PNG")
image_bytes = output.getvalue()
new_item = {"image": image_bytes}
new_dataset_train = dataset_train.add_item(new_item)
new_dataset_train[-1]["image"]
Out[14]:
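Since the column has the Image feature type, add_item may also accept a PIL image directly, avoiding the manual byte encoding; an untested sketch:

# Untested assumption: the Image feature encodes PIL images itself
new_dataset_train = dataset_train.add_item({"image": random_image})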
View the Data Structure Stored as an Apache Arrow Table¶
In [15]:
dataset_train.data
Out[15]:
InMemoryTable
image: struct<bytes: binary, path: string>
  child 0, bytes: binary
  child 1, path: string
----
image: [
  -- is_valid: all not null
  -- child 0 type: binary
[null,null,null,null,null,...,null,null,null,null,null]
  -- child 1 type: string
["data/pokemon/train/accelgor/0.jpg","data/pokemon/train/absol/2.jpg","data/pokemon/train/absol/1.jpg","data/pokemon/train/absol/0.jpg","data/pokemon/train/abra/2.jpg",...,"data/pokemon/train/aegislash/1.jpg","data/pokemon/train/aegislash/0.jpg","data/pokemon/train/abomasnow/2.jpg","data/pokemon/train/abomasnow/1.jpg","data/pokemon/train/abomasnow/0.jpg"]]
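The Arrow-backed data can also be exported, for example to a pandas DataFrame; a minimal sketch:

df = dataset_train.to_pandas() # The image column becomes dicts of {bytes, path}
print(df.head())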
Number of Rows and Columns¶
In [16]:
print("Number of columns:", dataset_train.num_columns)
print("Number of rows:", dataset_train.num_rows)
Number of columns: 1
Number of rows: 13
Variant 2¶
In [17]:
print("Number of columns:", dataset_train.shape[1])
print("Number of rows:", dataset_train.shape[0])
Number of columns: 1
Number of rows: 13
Column Headers¶
In [18]:
new_dataset_train.column_names
Out[18]:
['image']