Version of Python¶

In [1]:
!python -V
Python 3.12.6

Import Required Packages¶

In [2]:
# Suppress warnings
import warnings

for warn in [UserWarning, FutureWarning]:
    warnings.filterwarnings("ignore", category = warn)

import io
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision as tv
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import pandas as pd
import datasets
import PIL
import jupyterlab as jlab
import ipywidgets
import time

from pathlib import Path
from dataclasses import dataclass
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names, get_dataset_config_names, Dataset, Image, DatasetInfo
from torchvision.transforms import v2
from PIL import Image as PILImage

Versions of Required Libraries¶

In [3]:
packages = [
    "Torch", "TorchVision", "NumPy", "Pandas", "Datasets", "Pillow", "JupyterLab", "IPyWidgets"
]

package_objects = [
    torch, tv, np, pd, datasets, PIL, jlab, ipywidgets
]

versions = list(map(lambda obj: obj.__version__, package_objects))

pkgs = {"Package": packages, "Version": versions}
df_pkgs = pd.DataFrame(data = pkgs)
df_pkgs.index.name = "#"
df_pkgs.index += 1

display(df_pkgs)

path_to_reqs = "."
reqs_name = "requirements.txt"

def get_packages_and_versions():
    """Generate strings with libraries and their versions in the format: package==version"""
    
    for package, version in zip(packages, versions):
        yield f"{package.lower()}=={version}\n"

with open(os.path.join(path_to_reqs, reqs_name), "w", encoding = "utf-8") as f:
    f.writelines(get_packages_and_versions())
#  Package      Version
1  Torch        2.2.2
2  TorchVision  0.17.2
3  NumPy        1.26.4
4  Pandas       2.2.3
5  Datasets     3.0.0
6  Pillow       10.4.0
7  JupyterLab   4.2.5
8  IPyWidgets   8.1.5
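As a usage note, the generated requirements.txt can be fed back to pip to recreate the environment, for example straight from the notebook:

!pip install -r requirements.txt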

Downloading a Dataset from the Hugging Face Hub¶

Before downloading a dataset, you can inspect the general information stored in its DatasetInfo, including the description, features, and dataset size

Check Attributes of a Dataset without Downloading It¶

In [4]:
cache_dir = os.path.join(".", "data") # Directory for downloading the dataset

# Check the general information required to build the dataset without downloading the dataset itself
ds_builder = load_dataset_builder(path = "uoft-cs/cifar10", cache_dir = cache_dir, trust_remote_code = True)

# Information about the dataset
print(ds_builder.info, "\n")
print(ds_builder.info.features)
DatasetInfo(description='', citation='', homepage='', license='', features={'img': Image(mode=None, decode=True, id=None), 'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='cifar10', config_name='plain_text', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=113861310, num_examples=50000, shard_lengths=None, dataset_name='cifar10'), 'test': SplitInfo(name='test', num_bytes=22774180, num_examples=10000, shard_lengths=None, dataset_name='cifar10')}, download_checksums={'hf://datasets/uoft-cs/cifar10@0b2714987fa478483af9968de7c934580d0bb9a2/plain_text/train-00000-of-00001.parquet': {'num_bytes': 119705255, 'checksum': None}, 'hf://datasets/uoft-cs/cifar10@0b2714987fa478483af9968de7c934580d0bb9a2/plain_text/test-00000-of-00001.parquet': {'num_bytes': 23940850, 'checksum': None}}, download_size=143646105, post_processing_size=None, dataset_size=136635490, size_in_bytes=280281595) 

{'img': Image(mode=None, decode=True, id=None), 'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], id=None)}
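Individual DatasetInfo attributes are also available directly on the builder, which makes it easy to check, say, the download size before fetching anything. A minimal sketch using attributes visible in the repr above:

# Read selected DatasetInfo attributes (values correspond to the repr above)
print("Dataset name:", ds_builder.info.dataset_name)
print("Download size (bytes):", ds_builder.info.download_size)
print("Dataset size (bytes):", ds_builder.info.dataset_size)
print("Train examples:", ds_builder.info.splits["train"].num_examples)
print("Class names:", ds_builder.info.features["label"].names)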

List of Possible Configurations Available for the Dataset¶

In [5]:
configs = get_dataset_config_names(path = "uoft-cs/cifar10", trust_remote_code = True)

print(configs)
['plain_text']

Available Data Subsets¶

In [6]:
dataset_split_names = get_dataset_split_names(path = "uoft-cs/cifar10", trust_remote_code = True)

print(dataset_split_names)
['train', 'test']

Downloading a Dataset¶

In [7]:
dataset_train, dataset_test = load_dataset(
    path = "uoft-cs/cifar10",
    name = "plain_text", # see configs
    split = dataset_split_names,
    cache_dir = cache_dir,
    trust_remote_code = True
)

print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test)
Train subset:
 Dataset({
    features: ['img', 'label'],
    num_rows: 50000
}) 

Test subset:
 Dataset({
    features: ['img', 'label'],
    num_rows: 10000
})
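The split argument also accepts slicing expressions from the datasets split API, which is convenient for quick experiments; note that the underlying files are still downloaded in full, only the returned rows are restricted. A minimal sketch:

# Load only the first 100 rows of the training subset
dataset_train_small = load_dataset(
    path = "uoft-cs/cifar10",
    name = "plain_text",
    split = "train[:100]",
    cache_dir = cache_dir,
    trust_remote_code = True
)

print(dataset_train_small.num_rows) # 100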

Indexing a Dataset¶

A Dataset contains columns of different data types and supports indexing by row, by slice, and by column

In [8]:
# Getting the first row of data
print(dataset_train[0], "\n")

# Getting the last row of data
print(dataset_train[-1], "\n")

# Get a slice of rows
print(dataset_train[3:7], "\n")

start_time = time.time()
# Get the first rows of a specified column
print(dataset_train[:10]["img"])
end_time = time.time()
print(f"Runtime: {end_time - start_time:.4f} seconds")
{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x135482CC0>, 'label': 0} 

{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x135481E20>, 'label': 5} 

{'img': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3EFC0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3F230>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FA70>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FE00>], 'label': [2, 7, 2, 1]} 

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FEF0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FF50>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC3FF80>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64080>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64050>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC640B0>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64140>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64200>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC64170>, <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC641D0>]
Runtime: 0.0024 seconds
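The runtime above depends on indexing order: dataset_train[:10]["img"] decodes only ten rows before selecting the column, whereas dataset_train["img"] would decode every image in the column first. When only a few values are needed, index the rows first. A comparison sketch (the second variant decodes all 50,000 images, so expect it to be much slower):

start_time = time.time()
rows_first = dataset_train[:10]["img"] # Decode 10 rows, then take the column
print(f"Rows first: {time.time() - start_time:.4f} seconds")

start_time = time.time()
column_first = dataset_train["img"][:10] # Decode the whole column, then slice
print(f"Column first: {time.time() - start_time:.4f} seconds")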

Apply Transformations to Images¶

In [9]:
compose = v2.Compose([
    v2.RandomResizedCrop(size = (64, 64), antialias = True),
    v2.RandomRotation(degrees = (0, 90))
])

def transforms(examples: dict):
    # Apply transformations to all images (img) and save the results (pixel_values)
    examples["pixel_values"] = list(map(compose, examples["img"]))

    return examples # Result

# Applying the transform function to the training subset
dataset_train.set_transform(transforms)

# Getting a row
print(dataset_train[1])
dataset_train[1]["pixel_values"]
{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x13DC680E0>, 'label': 6, 'pixel_values': <PIL.Image.Image image mode=RGB size=64x64 at 0x13DC68260>}
Out[9]:
(image output: a randomly cropped and rotated 64×64 image)
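set_transform attaches the transform to the dataset in place; with_transform does the same but returns a new Dataset object instead of mutating the receiver. A minimal sketch reusing the transforms function above:

# Attach the transform to a new dataset object rather than in place
dataset_train_aug = dataset_train.with_transform(transforms)

print(dataset_train_aug[1]["pixel_values"].size) # (64, 64)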

Creating a Dataset¶

The dataset is partitioned based on the root directory structure, and labels are automatically created from the names of the subdirectories

$$ dataset/{train|test}/{class}/{filename} $$

  • ${train|test}$ — training or test subset
  • ${class}$ — class name
  • ${filename}$ — image or audio file

Directory-based¶

In [10]:
data_dir = os.path.join(".", "data", "pokemon") # Directory containing the dataset

dataset = load_dataset(
    path = "imagefolder",
    data_dir = data_dir
)
dataset_train, dataset_test = dataset['train'], dataset['test']

compose = v2.Compose([
    v2.Resize(size = (128, 128), antialias = True)
])

def transforms(examples: dict):
    # Apply transformations to all images ("image" column) and save the results (pixel_values)
    examples["pixel_values"] = list(map(lambda image: F.rotate(compose(image), angle = 45), examples["image"]))

    return examples # Result

# Applying the transform function to the training subset
dataset_train.set_transform(transforms)

print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test, "\n")

# Getting the first row of data
print(dataset_train[0])

dataset_train[0]["pixel_values"]
Train subset:
 Dataset({
    features: ['image', 'label'],
    num_rows: 13
}) 

Test subset:
 Dataset({
    features: ['image', 'label'],
    num_rows: 5
}) 

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=360x361 at 0x1345CDF40>, 'label': 0, 'pixel_values': <PIL.Image.Image image mode=RGB size=128x128 at 0x1345CF0E0>}
Out[10]:
(image output: a 128×128 image rotated by 45°)
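When the directory layout does not follow the {train|test}/{class} convention, imagefolder can instead be pointed at explicit glob patterns through data_files. A sketch under the assumption that the same pokemon directory is reused (the patterns are illustrative):

# Select files by glob pattern instead of relying on the directory convention
dataset = load_dataset(
    path = "imagefolder",
    data_files = {
        "train": os.path.join(data_dir, "train", "*", "*.jpg"),
        "test": os.path.join(data_dir, "test", "*", "*.jpg")
    }
)

print(dataset)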

Dictionary-based¶

In [11]:
data_dir = os.path.join(".", "data", "pokemon") # Directory containing the dataset

# Function to get paths to images
def get_image_paths(data_dir, split):
    split_dir = Path(data_dir) / Path(split)
    return list(split_dir.rglob("*.jpg"))

# Paths to images
train_images = get_image_paths(data_dir, "train")
test_images = get_image_paths(data_dir, "test")

dataset_train = Dataset.from_dict({"image": list(map(str, train_images))}).cast_column("image", Image())
dataset_test = Dataset.from_dict({"image": list(map(str, test_images))}).cast_column("image", Image())

compose = v2.Compose([
    v2.Resize(size = (128, 128), antialias = True)
])

def transforms(examples: dict):
    # Apply transformations to all images ("image" column) and save the results (pixel_values)
    examples["pixel_values"] = list(map(lambda image: F.rotate(compose(image), angle = -45), examples["image"]))

    return examples # Result

# Applying the transform function to the training subset
dataset_train.set_transform(transforms)

print("Train subset:\n", dataset_train, "\n")
print("Test subset:\n", dataset_test, "\n")

# Getting the first row of data
print(dataset_train[0])

dataset_train[0]["pixel_values"]
Train subset:
 Dataset({
    features: ['image'],
    num_rows: 13
}) 

Test subset:
 Dataset({
    features: ['image'],
    num_rows: 5
}) 

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=360x325 at 0x13DC8B200>, 'pixel_values': <PIL.Image.Image image mode=RGB size=128x128 at 0x13DC8B140>}
Out[11]:
(image output: a 128×128 image rotated by −45°)
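Unlike the directory-based loader, this dictionary-based dataset has no label column. One can be rebuilt from each file's parent directory and cast to ClassLabel; a sketch (deriving the class list by sorting the directory names is an assumption about the desired label order):

from datasets import ClassLabel

# Derive class names from the parent directory of each image file
train_labels = [path.parent.name for path in train_images]
class_names = sorted(set(train_labels))

dataset_train_labeled = Dataset.from_dict({
    "image": list(map(str, train_images)),
    "label": train_labels
}).cast_column("image", Image()).cast_column("label", ClassLabel(names = class_names))

print(dataset_train_labeled.features)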

Create/Read General Information Stored in DatasetInfo¶

In [12]:
dataset_info_dir = os.path.join(".", "data", "pokemon") # Directory with the dataset

dataset_train.info.write_to_directory(dataset_info_dir = dataset_info_dir)

ds_info = DatasetInfo.from_directory(dataset_info_dir = dataset_info_dir)

print(ds_info)
DatasetInfo(description='', citation='', homepage='', license='', features={'image': Image(mode=None, decode=True, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)
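The DatasetInfo fields are ordinary attributes and can be filled in before writing, so the saved metadata carries more than empty strings. A minimal sketch (the description and license values are placeholders, not real metadata):

# Populate a few metadata fields before saving (example values)
dataset_train.info.description = "Pokemon images used for transform demos"
dataset_train.info.license = "CC-BY-4.0"

dataset_train.info.write_to_directory(dataset_info_dir = dataset_info_dir)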

Add a Column¶

In [13]:
column = list(range(1, dataset_train.num_rows + 1))

new_dataset_train = dataset_train.add_column(name = "numbers", column = column)

new_dataset_train
Out[13]:
Dataset({
    features: ['image', 'numbers'],
    num_rows: 13
})
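The complementary operations behave the same way: rename_column and remove_columns also return new Dataset objects rather than mutating in place. A short sketch on the dataset just created:

# Rename the added column, then drop it; each call returns a new dataset
renamed = new_dataset_train.rename_column("numbers", "row_id")
trimmed = renamed.remove_columns(["row_id"])

print(renamed.column_names) # ['image', 'row_id']
print(trimmed.column_names) # ['image']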

Add a Row¶

In [14]:
# Array of random pixels
random_image_array = np.random.randint(0, 256, (128, 128, 3), dtype = np.uint8)
# Image built from the array
random_image = PILImage.fromarray(random_image_array)

with io.BytesIO() as output:
    random_image.save(output, format = "PNG")
    image_bytes = output.getvalue()

new_item = {"image": image_bytes}

new_dataset_train = dataset_train.add_item(new_item)

new_dataset_train[-1]["image"]
Out[14]:
(image output: a random-noise 128×128 image)
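For more than a handful of rows, building a second Dataset and concatenating is usually cleaner than repeated add_item calls. A sketch, assuming the image bytes can be cast to the same Image feature as the original column:

from datasets import concatenate_datasets

# Wrap the new rows in a Dataset with a matching image column
extra_rows = Dataset.from_dict({"image": [image_bytes]}).cast_column("image", Image())

combined = concatenate_datasets([dataset_train, extra_rows])
print(combined.num_rows) # 14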

View the Data Structure Stored as an Apache Arrow Table¶

In [15]:
dataset_train.data
Out[15]:
InMemoryTable
image: struct<bytes: binary, path: string>
  child 0, bytes: binary
  child 1, path: string
----
image: [
  -- is_valid: all not null
  -- child 0 type: binary
[null,null,null,null,null,...,null,null,null,null,null]
  -- child 1 type: string
["data/pokemon/train/accelgor/0.jpg","data/pokemon/train/absol/2.jpg","data/pokemon/train/absol/1.jpg","data/pokemon/train/absol/0.jpg","data/pokemon/train/abra/2.jpg",...,"data/pokemon/train/aegislash/1.jpg","data/pokemon/train/aegislash/0.jpg","data/pokemon/train/abomasnow/2.jpg","data/pokemon/train/abomasnow/1.jpg","data/pokemon/train/abomasnow/0.jpg"]]

Number of Columns/Rows¶

Variant 1¶

In [16]:
print("Number of columns:", dataset_train.num_columns)
print("Number of rows:", dataset_train.num_rows)
Number of columns: 1
Number of rows: 13

Variant 2¶

In [17]:
print("Number of columns:", dataset_train.shape[1])
print("Number of rows:", dataset_train.shape[0])
Number of columns: 1
Number of rows: 13

Column Headers¶

In [18]:
new_dataset_train.column_names
Out[18]:
['image']
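Beyond the column names, the features attribute reports each column's type, which is a quick way to confirm that a cast or transform produced the expected schema:

print(new_dataset_train.features) # {'image': Image(mode=None, decode=True, id=None)}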