Version of Python¶
!python -V
Python 3.12.6
Import Required Packages¶
# Suppress warnings
import warnings
for warn in [UserWarning, FutureWarning]:
    warnings.filterwarnings("ignore", category = warn)
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import jupyterlab as jlab
Versions of Required Libraries¶
packages = [
"Torch", "NumPy", "Pandas", "JupyterLab",
]
package_objects = [
torch, np, pd, jlab
]
versions = list(map(lambda obj: obj.__version__, package_objects))
pkgs = {"Package": packages, "Version": versions}
df_pkgs = pd.DataFrame(data = pkgs)
df_pkgs.index.name = "#"
df_pkgs.index += 1
display(df_pkgs)
path_to_reqs = "."
reqs_name = "requirements.txt"
def get_packages_and_versions():
"""Generate strings with libraries and their versions in the format: package==version"""
for package, version in zip(packages, versions):
yield f"{package.lower()}=={version}\n"
with open(os.path.join(path_to_reqs, reqs_name), "w", encoding = "utf-8") as f:
f.writelines(get_packages_and_versions())
| # | Package | Version |
|---|---|---|
| 1 | Torch | 2.2.2 |
| 2 | NumPy | 1.26.4 |
| 3 | Pandas | 2.2.3 |
| 4 | JupyterLab | 4.2.5 |
ReLU (Rectified Linear Unit)¶
Replace all negative values with 0, leaving positive values unchanged
$$ \text{ReLU}(x) = \begin{cases} 0 & \text{if } x \leq 0 \\ x & \text{if } x > 0 \end{cases} $$
# Create a ReLU activation object
g = nn.ReLU()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the ReLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([0.0000, 1.1458])
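As a quick cross-check (a minimal sketch reusing the same input values), the functional form torch.nn.functional.relu and a plain clamp at zero give the same result as the nn.ReLU() module:
import torch.nn.functional as F
x = torch.tensor([-2.3526, 1.1458])
# ReLU via the functional API and via an explicit clamp at zero
print(F.relu(x))                 # tensor([0.0000, 1.1458])
print(torch.clamp(x, min = 0.0)) # tensor([0.0000, 1.1458])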
ELU (Exponential Linear Unit)¶
Convert negative values to $\alpha (\exp(x) - 1)$, giving a smoother transition around zero than ReLU, while leaving positive values unchanged
$$ \text{ELU}(x) = \begin{cases} x & \text{if } x > 0 \\ \alpha (\exp(x) - 1) & \text{if } x \leq 0 \end{cases} $$
# Create an ELU activation object
g = nn.ELU(alpha = 1.0)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the ELU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.9049, 1.1458])
PReLU (Parametric ReLU)¶
Convert values by leaving positive values unchanged and multiplying negative values by $\alpha$, where $\alpha$ is a learnable parameter
$$ \text{PReLU}(x) = \begin{cases} x & \text{if } x \geq 0 \\ \alpha x & \text{if } x < 0 \end{cases} $$
# Create a PReLU activation object
g = nn.PReLU(num_parameters = 4, init = 0.25)
# Create a random tensor
input = torch.tensor([
[ 0.6465, -0.9450, -0.5559, -1.5250],
[-1.4968, -1.1030, 0.5872, -0.7036]
]) # torch.randn(2, 4)
# Apply the PReLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([[ 0.6465, -0.9450, -0.5559, -1.5250],
        [-1.4968, -1.1030, 0.5872, -0.7036]])
Output data: tensor([[ 0.6465, -0.2362, -0.1390, -0.3812],
        [-0.3742, -0.2758, 0.5872, -0.1759]], grad_fn=<PreluKernelBackward0>)
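The slopes $\alpha$ of nn.PReLU are ordinary module parameters (here one per channel), so they can be inspected and are updated by the optimizer during training; a minimal check reusing the g object created above:
# The learnable per-channel slopes, all initialized to init = 0.25
print(g.weight)       # Parameter containing: tensor([0.2500, 0.2500, 0.2500, 0.2500], requires_grad=True)
print(g.weight.shape) # torch.Size([4])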
LeakyReLU¶
Convert values by leaving positive values unchanged and multiplying negative values by the fixed factor $\alpha$, where $\alpha = \text{negative\_slope}$
$$ \text{LeakyReLU}(x) = \begin{cases} x & \text{if } x \geq 0 \\ \alpha x & \text{if } x < 0 \end{cases} $$
# Create a LeakyReLU activation object
g = nn.LeakyReLU(negative_slope = 0.01)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the LeakyReLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.0235, 1.1458])
ReLU6¶
Convert values less than 0 to 0, greater than 6 to 6, and leave other values unchanged
$$ \text{ReLU6}(x) = \min(\max(0, x), 6) $$
# Create a ReLU6 activation object
g = nn.ReLU6()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458, 6.2345]) # torch.randn(3)
# Apply the ReLU6 activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458, 6.2345])
Output data: tensor([0.0000, 1.1458, 6.0000])
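ReLU6 is simply the input clamped to the interval $[0, 6]$; a minimal cross-check reusing the same input values:
x = torch.tensor([-2.3526, 1.1458, 6.2345])
# Clamping to [0, 6] reproduces the nn.ReLU6() output
print(torch.clamp(x, min = 0.0, max = 6.0)) # tensor([0.0000, 1.1458, 6.0000])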
RReLU (Randomized Leaky ReLU)¶
Convert values by leaving positive values unchanged and multiplying negative values by a random factor $\alpha$, sampled uniformly from $[\text{lower}, \text{upper}]$ during training
$$ \text{RReLU}(x) = \begin{cases} x & \text{if } x \geq 0 \\ \alpha x & \text{if } x < 0 \end{cases} $$
# Create an RReLU activation object
g = nn.RReLU(lower = 0.125, upper = 0.333)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458, 6.2345]) # torch.randn(3)
# Apply the RReLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458, 6.2345])
Output data: tensor([-0.7478, 1.1458, 6.2345])
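Note that the random factor is resampled on every call only in training mode; in evaluation mode the slope is fixed at $(lower + upper) / 2$, so the output becomes deterministic. A minimal sketch reusing the g object created above:
x = torch.tensor([-2.3526, 1.1458, 6.2345])
# Training mode: the negative slope is sampled from U(lower, upper) on every call
g.train()
print(g(x), g(x)) # the negative element may differ between the two calls
# Evaluation mode: the slope is fixed at (0.125 + 0.333) / 2 = 0.229
g.eval()
print(g(x)) # the negative element is scaled by 0.229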
SELU (Scaled Exponential Linear Unit)¶
Convert values by scaling positive values with the $\text{scale}$ parameter and transforming negative values with both the $\alpha$ and $\text{scale}$ parameters, where $\text{scale} \approx 1.0507$ and $\alpha \approx 1.6733$
$$ \text{SELU}(x) = \text{scale} \times (\max(0, x) + \min(0, \alpha \times (\exp(x) - 1))) $$
# Create a SELU activation object
g = nn.SELU()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the SELU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-1.5909, 1.2039])
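Because the SELU constants are fixed ($\text{scale} \approx 1.0507$, $\alpha \approx 1.6733$), the output can be reproduced directly from the formula; a minimal sketch with the same input values (the constants below are assumed from the SELU definition):
scale = 1.0507009873554805
alpha = 1.6732632423543772
x = torch.tensor([-2.3526, 1.1458])
# max(0, x) and min(0, alpha * (exp(x) - 1)) expressed via clamp
manual = scale * (torch.clamp(x, min = 0.0) + torch.clamp(alpha * (torch.exp(x) - 1), max = 0.0))
print(manual) # tensor([-1.5909, 1.2039])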
CELU (Continuously Differentiable Exponential Linear Unit)¶
Convert negative values using the exponential function and the $\alpha$ parameter, leaving positive values unchanged
$$ \text{CELU}(x) = \begin{cases} x & \text{if } x \geq 0 \\ \alpha \times (\exp(x / \alpha) - 1) & \text{if } x < 0 \end{cases} $$
# Create a CELU activation object
g = nn.CELU(alpha = 1.0)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the CELU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.9049, 1.1458])
GELU (Gaussian Error Linear Unit)¶
Transform values by weighting them with the cumulative distribution function of the standard normal distribution
$$ \text{GELU}(x) = x \times \Phi(x) \quad \text{where } \Phi(x) \text{ is the cumulative distribution function of the standard normal distribution } (\text{approximate} = \text{none}) $$
$$ \text{GELU}(x) = 0.5 \times x \times \left(1 + \tanh\left(\sqrt{\frac{2}{\pi}} \left(x + 0.044715 x^3\right)\right)\right) \quad (\text{approximate} = \text{tanh}) $$
# Create a GELU activation object
g = nn.GELU(approximate = "none") # none | tanh
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the GELU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.0219, 1.0015])
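The two variants can be compared directly; a minimal sketch reusing the same input values, with the tanh approximation also computed by hand from the formula above:
import math
x = torch.tensor([-2.3526, 1.1458])
exact = nn.GELU(approximate = "none")(x)
approx = nn.GELU(approximate = "tanh")(x)
# The tanh approximation written out explicitly
manual = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
print(exact)  # tensor([-0.0219, 1.0015])
print(approx) # very close to the exact variant
print(manual) # matches the approximate = "tanh" output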
Sigmoid¶
Convert values to the range from 0 to 1 using the logistic function
$$ \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)} $$
# Create a Sigmoid activation object
g = nn.Sigmoid()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Sigmoid activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([0.0869, 0.7587])
SiLU (Sigmoid Linear Unit)¶
Multiply the input values $x$ by their sigmoid transform $\sigma(x) = \frac{1}{1 + e^{-x}}$
$$ \text{SiLU}(x) = x \times \sigma(x) \quad \text{where } \sigma(x) = \frac{1}{1 + e^{-x}} \text{ is the sigmoid function} $$
# Create a SiLU activation object
g = nn.SiLU()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the SiLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.2043, 0.8694])
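SiLU (also known as swish) is just the input multiplied by its own sigmoid, so it is easy to verify by hand; a minimal sketch with the same input values:
x = torch.tensor([-2.3526, 1.1458])
# Multiply the input by its sigmoid
print(x * torch.sigmoid(x)) # tensor([-0.2043, 0.8694])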
LogSigmoid¶
Convert values using the logarithm of a sigmoid function
$$ \text{LogSigmoid}(x) = \log \left( \frac{1}{1 + \exp(-x)} \right) $$
# Create a LogSigmoid activation object
g = nn.LogSigmoid()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the LogSigmoid activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-2.4435, -0.2761])
Hardsigmoid¶
Convert values less than or equal to $-3$ to 0, values greater than or equal to $3$ to 1, and values between $-3$ and $3$ to $\frac{x}{6} + 0.5$
$$ \text{Hardsigmoid}(x) = \begin{cases} 0 & \text{if } x \leq -3 \\ 1 & \text{if } x \geq 3 \\ \frac{x}{6} + 0.5 & \text{otherwise} \end{cases} $$
# Create a Hardsigmoid activation object
g = nn.Hardsigmoid()
# Create a random tensor
input = torch.tensor([-3.3526, 3.1458, -2.0256, 1.7843]) # torch.randn(4)
# Apply the Hardsigmoid activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-3.3526, 3.1458, -2.0256, 1.7843])
Output data: tensor([0.0000, 1.0000, 0.1624, 0.7974])
Tanh¶
Convert values to the range from -1 to 1
$$ \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)} $$
# Create a Tanh activation object
g = nn.Tanh()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Tanh activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.9821, 0.8164])
Tanhshrink¶
Transform values by subtracting the hyperbolic tangent $\tanh(x)$ from the input values themselves
$$ \text{Tanhshrink}(x) = x - \tanh(x) $$
# Create a Tanhshrink activation object
g = nn.Tanhshrink()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Tanhshrink activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-1.3705, 0.3294])
Hardtanh¶
Convert values less than $min\_val$ to $min\_val$, values greater than $max\_val$ to $max\_val$, and leave values between $min\_val$ and $max\_val$ unchanged
$$ \text{HardTanh}(x) = \begin{cases} min\_val & \text{if } x < min\_val \\ max\_val & \text{if } x > max\_val \\ x & \text{otherwise} \end{cases} $$
# Create a Hardtanh activation object
g = nn.Hardtanh(min_val = -1.0, max_val = 1.0)
# Create a random tensor
input = torch.tensor([-1.1383, 1.1630, -0.8715, 0.7228]) # torch.randn(4)
# Apply the Hardtanh activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-1.1383, 1.1630, -0.8715, 0.7228])
Output data: tensor([-1.0000, 1.0000, -0.8715, 0.7228])
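Hardtanh is equivalent to clamping the input to $[min\_val, max\_val]$; a minimal cross-check with the same input values:
x = torch.tensor([-1.1383, 1.1630, -0.8715, 0.7228])
# Clamping to [-1, 1] reproduces the nn.Hardtanh output
print(torch.clamp(x, min = -1.0, max = 1.0)) # tensor([-1.0000, 1.0000, -0.8715, 0.7228])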
Hardshrink¶
Convert values between $-\lambda$ and $\lambda$ inclusive to 0, and leave values less than $-\lambda$ or greater than $\lambda$ unchanged
$$ \text{HardShrink}(x) = \begin{cases} x & \text{if } x > \lambda \\ x & \text{if } x < -\lambda \\ 0 & \text{otherwise} \end{cases} $$
# Create a Hardshrink activation object
g = nn.Hardshrink(lambd = 1.1458)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Hardshrink activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-2.3526, 0.0000])
Hardswish¶
Convert values less than or equal to $-3$ to 0, leave values greater than or equal to $3$ unchanged, and convert values between $-3$ and $3$ using the function $x \left(\frac{x+3}{6}\right)$
$$ \text{Hardswish}(x) = \begin{cases} 0 & \text{if } x \leq -3 \\ x & \text{if } x \geq 3 \\ x \left(\frac{x+3}{6}\right) & \text{otherwise} \end{cases} $$
# Create a Hardswish activation object
g = nn.Hardswish()
# Create a random tensor
input = torch.tensor([-3.3526, 3.1458, -2.0256, 1.7843]) # torch.randn(4)
# Apply the Hardswish activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-3.3526, 3.1458, -2.0256, 1.7843])
Output data: tensor([-0.0000, 3.1458, -0.3290, 1.4228])
Mish¶
Convert values by multiplying $x$ by the hyperbolic tangent of its $\text{Softplus}$ transform ($\text{Softplus}(x) = \frac{1}{\beta} \log(1 + \exp(\beta \times x))$)
$$ \text{Mish}(x) = x \times \tanh(\text{Softplus}(x)) $$
# Create a Mish activation object
g = nn.Mish()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Mish activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([-0.2132, 1.0198])
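Mish can be reproduced from its definition using the softplus and tanh functions; a minimal sketch with the same input values:
import torch.nn.functional as F
x = torch.tensor([-2.3526, 1.1458])
# x multiplied by the tanh of its softplus transform
print(x * torch.tanh(F.softplus(x))) # tensor([-0.2132, 1.0198])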
Softplus¶
Smoothly approximate the ReLU function, with the sharpness of the transition between the near-zero and linear regions controlled by the $\beta$ parameter
$$ \text{Softplus}(x) = \frac{1}{\beta} \log(1 + \exp(\beta \times x)) $$
# Create a Softplus activation object
g = nn.Softplus(beta = 1.0, threshold = 20.0)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458]) # torch.randn(2)
# Apply the Softplus activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458])
Output data: tensor([0.0909, 1.4219])
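The threshold parameter exists for numerical stability: when $\beta \times x$ exceeds it, the implementation switches to the identity function so that $\exp(\beta \times x)$ is never evaluated for large inputs. A minimal sketch (the value 25.0 is an arbitrary example above the threshold):
g = nn.Softplus(beta = 1.0, threshold = 20.0)
x = torch.tensor([1.1458, 25.0])
# Values with beta * x > threshold pass through essentially unchanged
print(g(x))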
Softshrink¶
Shrink values toward zero by $\lambda$ and set them to zero if their absolute value does not exceed $\lambda$
$$ \text{Softshrink}(x) = \begin{cases} x - \lambda & \text{if } x > \lambda \\ x + \lambda & \text{if } x < -\lambda \\ 0 & \text{otherwise} \end{cases} $$
# Create a Softshrink activation object
g = nn.Softshrink(lambd = 0.5)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458, 0.4320, -0.3791]) # torch.randn(4)
# Apply the Softshrink activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458, 0.4320, -0.3791])
Output data: tensor([-1.8526, 0.6458, 0.0000, 0.0000])
Softsign¶
Transform values by dividing them by one plus their absolute value, squashing the output into the range from -1 to 1
$$ \text{Softsign}(x) = \frac{x}{1 + |x|} $$
# Create a Softsign activation object
g = nn.Softsign()
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458, 0.4320, -0.3791]) # torch.randn(4)
# Apply the Softsign activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458, 0.4320, -0.3791])
Output data: tensor([-0.7017, 0.5340, 0.3017, -0.2749])
Threshold¶
Convert values by leaving those greater than the specified $threshold$ unchanged and replacing all other values with the specified $value$
$$ \text{Threshold}(x) = \begin{cases} x & \text{if } x > threshold \\ value & \text{otherwise} \end{cases} $$
# Create a Threshold activation object
g = nn.Threshold(threshold = 0.1, value = 20)
# Create a random tensor
input = torch.tensor([-2.3526, 1.1458, 0.4320, -0.3791]) # torch.randn(4)
# Apply the Threshold activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([-2.3526, 1.1458, 0.4320, -0.3791])
Output data: tensor([20.0000, 1.1458, 0.4320, 20.0000])
GLU (Gated Linear Unit)¶
Transform values by splitting the input into two halves along the specified dimension, leaving the first half unchanged, applying a sigmoid to the second half, and multiplying the two halves element-wise
$$ \text{GLU}(x) = x_1 \times \sigma(x_2) $$
# Create a GLU activation object
g = nn.GLU(dim = -1)
# Create a random tensor
input = torch.tensor([
[-0.0915, 0.2352],
[ 2.2440, 0.5817],
[ 0.4528, 0.6410],
[ 0.5200, 0.5567]
]) # torch.randn(4, 2)
# Apply the GLU activation function to the input data
output = g(input)
# Print the input and output data
print("Input data:", input)
print("Output data:", output)
Input data: tensor([[-0.0915, 0.2352],
        [ 2.2440, 0.5817],
        [ 0.4528, 0.6410],
        [ 0.5200, 0.5567]])
Output data: tensor([[-0.0511],
        [ 1.4394],
        [ 0.2966],
        [ 0.3306]])
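The gating can be reproduced by hand by splitting the tensor with chunk and applying the sigmoid to the second half; a minimal sketch reusing the input tensor from above:
# Split the last dimension into two halves and gate the first by the sigmoid of the second
a, b = input.chunk(2, dim = -1)
print(a * torch.sigmoid(b)) # matches the nn.GLU(dim = -1) output above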
MultiheadAttention¶
Convert values using a multi-head attention mechanism, where the $query$, $key$, and $value$ inputs are projected into multiple attention heads, the head outputs are concatenated, and a final linear transformation produces the result
$$ \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \dots, \text{head}_h) W^O \quad \text{where } \text{head}_i = \text{Attention}(Q W_i^Q, K W_i^K, V W_i^V) $$
# Parameters
embed_dim = 3 # Embed dimension
num_heads = 1 # Number of heads
# Create a MultiheadAttention object
multihead_attn = nn.MultiheadAttention(
embed_dim = embed_dim,
num_heads = num_heads,
dropout = 0.0,
bias = True,
add_bias_kv = False,
batch_first = True
)
# Create random tensors for query, key and value
query = torch.tensor([
[
[-1.4025, 0.4318, 0.3431],
[ 1.0711, 1.3455, 0.3277]],
[
[ 1.3409, 1.2159, 0.9589],
[ 0.5137, 0.4977, -0.6646]],
[
[-1.0612, 2.0423, 0.6509],
[-1.0072, 0.3578, -1.0799]
]
]) # (batch size, target sequence length, embed_dim)
key = torch.tensor([
[
[-1.4025, 0.4318, 0.3431],
[ 1.0711, 1.3455, 0.3277]],
[
[ 1.3409, 1.2159, 0.9589],
[ 0.5137, 0.4977, -0.6646]],
[
[-1.0612, 2.0423, 0.6509],
[-1.0072, 0.3578, -1.0799]
]
]) # (batch size, source sequence length, embed_dim)
value = torch.tensor([
[
[-1.4025, 0.4318, 0.3431],
[ 1.0711, 1.3455, 0.3277]],
[
[ 1.3409, 1.2159, 0.9589],
[ 0.5137, 0.4977, -0.6646]],
[
[-1.0612, 2.0423, 0.6509],
[-1.0072, 0.3578, -1.0799]
]
]) # (batch size, source sequence length, embed_dim)
# Apply the MultiheadAttention activation function to the input data
# attn_output = (batch size, target sequence length, embed_dim)
# attn_output_weights = (batch size, target sequence length, source sequence length)
attn_output, attn_output_weights = multihead_attn(query, key, value)
# Print the output data
print("Attention result:", attn_output)
print("Attention Weights:", attn_output_weights)
Attention result: tensor([[[ 0.2210, 0.3523, -0.3251],
         [ 0.2292, 0.4314, -0.3232]],
        [[ 0.0753, -0.0731, -0.1695],
         [ 0.0983, -0.0579, -0.1872]],
        [[-0.2672, 0.2664, 0.0135],
         [-0.0246, 0.5786, -0.2260]]], grad_fn=<TransposeBackward0>)
Attention Weights: tensor([[[0.3609, 0.6391],
         [0.4675, 0.5325]],
        [[0.4914, 0.5086],
         [0.5231, 0.4769]],
        [[0.1311, 0.8689],
         [0.3989, 0.6011]]], grad_fn=<MeanBackward1>)
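Each row of the returned attention weights is a softmax over the source positions, so it sums to 1 for every batch element and target position; a minimal check reusing the attn_output_weights tensor from above:
# Sum over the source sequence dimension: a (batch size, target sequence length) tensor
# in which every entry is (numerically) equal to 1
print(attn_output_weights.sum(dim = -1))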