🔧 Setup Environnement IA
Créer un environnement Conda
# Créer environnement avec Python 3.10
conda create -n ia-env python=3.10
conda activate ia-env
# Installer PyTorch avec CUDA 11.8
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
# Ou avec pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
Vérifier CUDA
import torch
# Vérifier disponibilité CUDA
print(f"CUDA disponible: {torch.cuda.is_available()}")
print(f"Version CUDA: {torch.version.cuda}")
print(f"Nombre de GPUs: {torch.cuda.device_count()}")
print(f"GPU actuel: {torch.cuda.get_device_name(0)}")
# Tester calcul GPU
x = torch.rand(1000, 1000).cuda()
y = torch.rand(1000, 1000).cuda()
z = x @ y # Multiplication sur GPU
WSL2 pour Windows
# Installation WSL2
wsl --install
wsl --set-default-version 2
# Installer Ubuntu
wsl --install -d Ubuntu-22.04
# Dans WSL, installer conda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
# Installer CUDA Toolkit dans WSL
wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin
sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
💡 Astuce: Utilisez conda env export > environment.yml pour sauvegarder votre environnement et le partager.
🐍 Python Essentials
Decorators
import time
from functools import wraps
def timer(func):
    """Decorator: print the wall-clock duration of every call to *func*."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time.time()
        outcome = func(*args, **kwargs)
        elapsed = time.time() - began
        print(f"{func.__name__} took {elapsed:.4f}s")
        return outcome
    return wrapper
@timer
def train_model(epochs):
    # Placeholder for a real training loop; the sleep simulates the work
    # so the @timer decorator has something measurable to report.
    # NOTE(review): `epochs` is unused in this demo body.
    time.sleep(2)
# Property decorator
class Model:
    """Tiny model shell: weights are exposed through a managed property."""

    def __init__(self):
        # Backing field for the `weights` property; None until assigned.
        self._store = None

    @property
    def weights(self):
        """Return the currently assigned weights (None until set)."""
        return self._store

    @weights.setter
    def weights(self, value):
        self._store = value
Generators
def data_generator(batch_size, num_batches):
    """Yield `num_batches` synthetic (features, labels) batches.

    Each batch holds `batch_size` random 784-dimensional feature vectors
    and integer class labels drawn from [0, 10).
    """
    produced = 0
    while produced < num_batches:
        features = torch.randn(batch_size, 784)
        targets = torch.randint(0, 10, (batch_size,))
        yield features, targets
        produced += 1
# Utilisation
for batch, labels in data_generator(32, 100):
# Traiter batch
pass
# Generator expression
squares = (x**2 for x in range(1000000)) # Économise mémoire
Type Hints
from typing import List, Dict, Tuple, Optional, Union
import numpy as np
def process_data(
    data: np.ndarray,
    labels: Optional[np.ndarray] = None,
    batch_size: int = 32
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Scale the input by 2 and report its mean/std.

    Args:
        data: feature array of any shape.
        labels: optional targets (unused by this demo; kept for API parity).
        batch_size: batch-size hint (unused by this demo; kept for API parity).

    Returns:
        The scaled array and a metrics dict with keys "mean" and "std",
        both computed on the *original* data.
    """
    doubled = data * 2.0
    stats = {"mean": float(data.mean()), "std": float(data.std())}
    return doubled, stats
# Type pour modèle
from torch import nn
ModelType = nn.Module
OptimizerType = Union[torch.optim.Adam, torch.optim.SGD]
Comprehensions
# List comprehension
squares = [x**2 for x in range(10) if x % 2 == 0]
# Dict comprehension
metrics = {f"layer_{i}": torch.randn(10) for i in range(5)}
# Set comprehension
unique_labels = {label for batch in dataset for label in batch}
# Nested comprehension
matrix = [[i*j for j in range(5)] for i in range(5)]
🔢 NumPy Operations
Array Creation
import numpy as np
# Différentes façons de créer arrays
a = np.array([1, 2, 3, 4, 5])
b = np.zeros((3, 4))
c = np.ones((2, 3, 4))
d = np.arange(0, 10, 0.5) # 0 à 10 par pas de 0.5
e = np.linspace(0, 1, 100) # 100 valeurs entre 0 et 1
f = np.random.randn(5, 5) # Distribution normale
g = np.eye(4) # Matrice identité 4x4
h = np.full((3, 3), 7) # Matrice remplie de 7
Reshaping
# Reshape
x = np.arange(12)
x_reshaped = x.reshape(3, 4) # 3x4
x_auto = x.reshape(-1, 2) # Calcul auto: 6x2
# Transpose
x_t = x_reshaped.T
x_swap = x.reshape(2, 3, 2).swapaxes(0, 1)
# Flatten
x_flat = x_reshaped.flatten() # Copie
x_ravel = x_reshaped.ravel() # Vue si possible
# Add dimension
x_expanded = x[np.newaxis, :] # (1, 12)
x_expanded2 = x[:, np.newaxis] # (12, 1)
Broadcasting
# Broadcasting examples
a = np.array([[1, 2, 3], [4, 5, 6]]) # (2, 3)
b = np.array([10, 20, 30]) # (3,)
c = a + b # Broadcast b sur chaque ligne
# Normalisation avec broadcasting
X = np.random.randn(100, 10)
mean = X.mean(axis=0, keepdims=True) # (1, 10)
std = X.std(axis=0, keepdims=True) # (1, 10)
X_normalized = (X - mean) / std
# Broadcasting pour distance matrices
points = np.random.randn(50, 3) # 50 points en 3D
distances = np.sqrt(((points[:, np.newaxis] - points) ** 2).sum(axis=2))
Einsum
# Einsum: notation Einstein pour opérations tensorielles
# Dot product
a = np.random.randn(5)
b = np.random.randn(5)
dot = np.einsum('i,i->', a, b) # Équivalent à np.dot(a, b)
# Matrix multiplication
A = np.random.randn(3, 4)
B = np.random.randn(4, 5)
C = np.einsum('ij,jk->ik', A, B) # A @ B
# Batch matrix multiplication
batch_A = np.random.randn(10, 3, 4)
batch_B = np.random.randn(10, 4, 5)
batch_C = np.einsum('bij,bjk->bik', batch_A, batch_B)
# Trace d'une matrice
trace = np.einsum('ii->', A)
# Outer product
outer = np.einsum('i,j->ij', a, b)
⚡ Performance: Utilisez toujours des opérations vectorisées plutôt que des boucles Python. NumPy est jusqu'à 100x plus rapide!
🐼 Pandas Essentials
Read & Write CSV
import pandas as pd
# Lecture CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', encoding='utf-8',
parse_dates=['date'], index_col=0)
# Lecture avec chunks pour gros fichiers
for chunk in pd.read_csv('huge_file.csv', chunksize=10000):
process(chunk)
# Écriture CSV
df.to_csv('output.csv', index=False)
df.to_csv('output.csv', sep='\t', encoding='utf-8')
GroupBy
# GroupBy basique
grouped = df.groupby('category')['value'].mean()
# Multiple aggregations
agg_result = df.groupby('category').agg({
'value': ['mean', 'std', 'count'],
'price': ['min', 'max', 'median']
})
# Custom aggregation
def custom_metric(x):
    """Range of *x* relative to its mean: (max - min) / mean."""
    spread = x.max() - x.min()
    return spread / x.mean()
df.groupby('category')['value'].apply(custom_metric)
# Multiple groupby keys
df.groupby(['category', 'subcategory']).mean()
# Transform (garde shape originale)
df['normalized'] = df.groupby('category')['value'].transform(
lambda x: (x - x.mean()) / x.std()
)
Merge & Join
# Merge (comme SQL JOIN)
result = pd.merge(df1, df2, on='key', how='inner') # inner, left, right, outer
# Merge sur multiple colonnes
result = pd.merge(df1, df2, on=['key1', 'key2'])
# Merge avec suffixes
result = pd.merge(df1, df2, on='key', suffixes=('_left', '_right'))
# Join sur index
result = df1.join(df2, how='left')
# Concat (empile dataframes)
result = pd.concat([df1, df2], axis=0) # Vertical
result = pd.concat([df1, df2], axis=1) # Horizontal
Pivot Table
# Pivot table
pivot = df.pivot_table(
values='sales',
index='date',
columns='category',
aggfunc='sum',
fill_value=0
)
# Multiple aggregations
pivot_multi = df.pivot_table(
values='sales',
index='date',
columns='category',
aggfunc=['sum', 'mean', 'count']
)
# Pivot simple (pas d'aggregation)
pivoted = df.pivot(index='date', columns='category', values='sales')
Gestion valeurs manquantes
# Détecter NaN
df.isna().sum() # Nombre par colonne
df.isnull().any() # Colonnes avec NaN
# Remplir NaN
df.fillna(0) # Avec constante
df.fillna(method='ffill') # Forward fill
df.fillna(method='bfill') # Backward fill
df.fillna(df.mean()) # Avec moyenne
# Remplir par colonne
df['age'].fillna(df['age'].median(), inplace=True)
# Supprimer NaN
df.dropna() # Supprime lignes avec NaN
df.dropna(axis=1) # Supprime colonnes avec NaN
df.dropna(thresh=2) # Garde lignes avec au moins 2 non-NaN
🚀 Optimisation: Utilisez df.query() au lieu de df[df['col'] > 5] pour de meilleures performances sur gros datasets.
📊 Matplotlib & Plotly
Matplotlib Subplots
import matplotlib.pyplot as plt
import numpy as np
# Subplots basiques
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes[0, 0].plot(x, y)
axes[0, 0].set_title('Line Plot')
axes[0, 1].scatter(x, y, alpha=0.5)
axes[0, 1].set_title('Scatter')
axes[1, 0].hist(data, bins=30, edgecolor='black')
axes[1, 0].set_title('Histogram')
axes[1, 1].bar(categories, values)
axes[1, 1].set_title('Bar Chart')
plt.tight_layout()
plt.savefig('plots.png', dpi=300, bbox_inches='tight')
plt.show()
Heatmap & Confusion Matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Heatmap de corrélation
correlation = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm',
center=0, square=True, linewidths=1)
plt.title('Matrice de Corrélation')
plt.show()
# Confusion Matrix
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 1, 1]
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()
Plotly Interactive
import plotly.graph_objects as go
import plotly.express as px
# Line plot interactif
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1, mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=x, y=y2, mode='lines', name='Val'))
fig.update_layout(
title='Training History',
xaxis_title='Epoch',
yaxis_title='Loss',
hovermode='x unified'
)
fig.show()
# Plotly Express (plus simple)
df = px.data.iris()
fig = px.scatter(df, x='sepal_width', y='sepal_length',
color='species', size='petal_length',
hover_data=['petal_width'])
fig.show()
# 3D Scatter
fig = go.Figure(data=[go.Scatter3d(
x=X[:, 0], y=X[:, 1], z=X[:, 2],
mode='markers',
marker=dict(size=5, color=labels, colorscale='Viridis')
)])
fig.show()
Visualisation Métriques ML
from sklearn.metrics import roc_curve, auc
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2,
label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
📓 Jupyter Tips
Magic Commands
# Mesurer temps d'exécution
%timeit sum(range(1000))
%%timeit
# Mesure tout le cell
result = expensive_computation()
# Matplotlib inline
%matplotlib inline
# Ou interactif
%matplotlib notebook
# Recharger modules automatiquement
%load_ext autoreload
%autoreload 2
# Afficher variables
%who # Liste variables
%whos # Détails variables
# Exécuter script Python
%run script.py
# Historique commandes
%history -n 1-10
# Debugging
%pdb # Active debugger sur erreur
Cell Magics
# %%time - Temps d'exécution cell
%%time
df = pd.read_csv('large_file.csv')
result = df.groupby('category').mean()
# %%writefile - Écrire dans fichier
%%writefile utils.py
def helper_function():
return "Hello from file"
# %%bash - Commandes bash
%%bash
ls -la
pwd
echo "Current directory"
# %%html - HTML custom
%%html
<div style="color: red;">Important!</div>
# %%javascript - JS dans notebook
%%javascript
alert('Hello from JS');
Extensions Utiles
# Installer extensions
pip install jupyter_contrib_nbextensions
jupyter contrib nbextension install --user
# Extensions recommandées:
# - Table of Contents: navigation facile
# - Variable Inspector: voir variables en temps réel
# - ExecuteTime: temps d'exécution cells
# - Collapsible Headings: replier sections
# - Code prettify: formatter code automatiquement
# JupyterLab extensions
jupyter labextension install @jupyterlab/toc
jupyter labextension install @jupyter-widgets/jupyterlab-manager
Keyboard Shortcuts
| Raccourci |
Action |
Shift + Enter |
Exécuter cell et passer à suivante |
Ctrl + Enter |
Exécuter cell |
A |
Insérer cell au dessus |
B |
Insérer cell en dessous |
DD |
Supprimer cell |
M |
Convertir en Markdown |
Y |
Convertir en Code |
Shift + M |
Fusionner cells |
💡 Pro Tip: Utilisez ?function_name pour afficher la docstring, et ??function_name pour voir le code source!
🔀 Git pour ML
Git LFS (Large File Storage)
# Installer Git LFS
git lfs install
# Tracker fichiers volumineux
git lfs track "*.pth"
git lfs track "*.h5"
git lfs track "*.pkl"
git lfs track "data/*.csv"
# Vérifier fichiers trackés
git lfs ls-files
# Cloner repo avec LFS
git lfs clone https://github.com/user/repo.git
# Pull fichiers LFS
git lfs pull
.gitignore pour ML
# Python
__pycache__/
*.py[cod]
*$py.class
.Python
env/
venv/
# Jupyter
.ipynb_checkpoints
*.ipynb_checkpoints/
# Data
data/raw/*
data/processed/*
!data/.gitkeep
# Models
models/*.pth
models/*.h5
models/*.pkl
!models/.gitkeep
# Logs & Experiments
logs/
runs/
mlruns/
wandb/
# Environment
.env
.env.local
credentials.json
# IDE
.vscode/
.idea/
*.swp
# OS
.DS_Store
Thumbs.db
DVC (Data Version Control)
# Installer DVC
pip install dvc
# Initialiser DVC
dvc init
# Ajouter remote storage (S3, GCS, Azure...)
dvc remote add -d storage s3://mybucket/dvc-storage
# Tracker données avec DVC
dvc add data/train.csv
git add data/train.csv.dvc data/.gitignore
git commit -m "Add training data"
# Pousser données vers remote
dvc push
# Pull données
dvc pull
# Créer pipeline ML
dvc run -n preprocess \
-d src/preprocess.py -d data/raw \
-o data/processed \
python src/preprocess.py
# Visualiser pipeline
dvc dag
Commits & Branches ML
# Branches pour expérimentations
git checkout -b experiment/transformer-v2
git checkout -b feature/data-augmentation
# Commit avec métriques
git commit -m "Train ResNet50 - Acc: 94.5% - Loss: 0.23"
# Tags pour versions modèles
git tag -a v1.0 -m "Model v1.0 - Production ready"
git push origin v1.0
# Stash pour sauvegarder WIP
git stash save "WIP: testing new architecture"
git stash list
git stash pop
# Rebase interactif pour nettoyer historique
git rebase -i HEAD~5
🎯 Best Practice: Ne jamais commiter les poids des modèles dans Git. Utilisez Git LFS ou DVC pour les fichiers > 100MB.
🧮 Algèbre Linéaire
Dot Product & Matrix Multiply
Dot Product: a · b = Σ aᵢbᵢ = |a||b|cos(θ)
# Dot product
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
dot = np.dot(a, b) # 32
# Matrix multiplication
A = np.random.randn(3, 4)
B = np.random.randn(4, 5)
C = A @ B # (3, 5)
C = np.matmul(A, B) # Équivalent
# Batch matrix multiplication
batch_A = np.random.randn(10, 3, 4)
batch_B = np.random.randn(10, 4, 5)
batch_C = batch_A @ batch_B # (10, 3, 5)
Eigenvalues & Eigenvectors
Équation propre: Av = λv
# Valeurs et vecteurs propres
A = np.random.randn(5, 5)
A = A @ A.T # Matrice symétrique
eigenvalues, eigenvectors = np.linalg.eig(A)
# Vérification
v = eigenvectors[:, 0]
lambda_v = eigenvalues[0]
assert np.allclose(A @ v, lambda_v * v)
# Diagonalisation: A = VΛV⁻¹
Lambda = np.diag(eigenvalues)
V = eigenvectors
A_reconstructed = V @ Lambda @ np.linalg.inv(V)
SVD (Singular Value Decomposition)
SVD: A = UΣVᵀ
# SVD
A = np.random.randn(100, 50)
U, s, Vt = np.linalg.svd(A, full_matrices=False)
# U: (100, 50), s: (50,), Vt: (50, 50)
# Reconstruction
Sigma = np.diag(s)
A_reconstructed = U @ Sigma @ Vt
# Compression avec SVD (garder k composantes)
k = 10
A_compressed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
# Variance expliquée
variance_explained = np.cumsum(s**2) / np.sum(s**2)
Normes & Distances
# Normes de vecteurs
v = np.array([3, 4])
# L1 norm (Manhattan)
l1 = np.linalg.norm(v, ord=1) # 7
# L2 norm (Euclidean)
l2 = np.linalg.norm(v, ord=2) # 5
# L-inf norm (maximum)
linf = np.linalg.norm(v, ord=np.inf) # 4
# Distance entre vecteurs
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
euclidean_dist = np.linalg.norm(a - b)
# Cosine similarity
cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
| Opération |
NumPy |
Dimension |
| Dot product |
np.dot(a, b) |
(n,) × (n,) → scalaire |
| Matrix multiply |
A @ B |
(m,n) × (n,p) → (m,p) |
| Element-wise |
A * B |
(m,n) × (m,n) → (m,n) |
| Outer product |
np.outer(a, b) |
(m,) × (n,) → (m,n) |
📐 Dérivées & Gradient
Chain Rule
Règle de la chaîne: d/dx[f(g(x))] = f'(g(x)) · g'(x)
# Example: y = (x² + 1)³
# f(u) = u³, g(x) = x² + 1
# dy/dx = 3u² · 2x = 6x(x² + 1)²
import torch
x = torch.tensor(2.0, requires_grad=True)
y = (x**2 + 1)**3
y.backward()
print(f"dy/dx = {x.grad}")  # 6·2·(4+1)² = 300.0 (the original note said 180.0, which is wrong)
# Multi-variable chain rule
x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)
z = x**2 + y**2
w = torch.exp(z)
w.backward()
print(f"dw/dx = {x.grad}") # 2x·exp(x²+y²)
print(f"dw/dy = {y.grad}") # 2y·exp(x²+y²)
Gradient Descent
Mise à jour: θ = θ - α∇J(θ)
α = learning rate, ∇J = gradient de la loss
# Gradient Descent simple
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit linear parameters theta by full-batch gradient descent on MSE.

    Args:
        X: (m, n) design matrix.
        y: (m,) target vector.
        learning_rate: step size alpha.
        epochs: number of full-batch updates.

    Returns:
        The learned (n,) parameter vector.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    for epoch in range(epochs):
        predictions = X @ theta            # forward pass
        residual = predictions - y
        grad = (1 / n_samples) * X.T @ residual   # dJ/dtheta
        theta = theta - learning_rate * grad
        # MSE/2 of the pre-update predictions, logged every 100 epochs.
        loss = (1 / (2 * n_samples)) * np.sum(residual ** 2)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Loss = {loss:.4f}")
    return theta
# Avec PyTorch
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(100):
predictions = model(X)
loss = criterion(predictions, y)
optimizer.zero_grad() # Reset gradients
loss.backward() # Compute gradients
optimizer.step() # Update parameters
Learning Rate
# Learning rate trop grand: divergence
# Learning rate trop petit: convergence lente
# Stratégies learning rate
from torch.optim.lr_scheduler import *
# Step decay
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
# Exponential decay
scheduler = ExponentialLR(optimizer, gamma=0.95)
# Cosine annealing
scheduler = CosineAnnealingLR(optimizer, T_max=100)
# Reduce on plateau
scheduler = ReduceLROnPlateau(optimizer, mode='min',
factor=0.1, patience=10)
# Utilisation
for epoch in range(100):
train(...)
val_loss = validate(...)
scheduler.step(val_loss) # ou scheduler.step() selon type
Dérivées Courantes
| Fonction |
Dérivée |
| xⁿ |
nxⁿ⁻¹ |
| eˣ |
eˣ |
| ln(x) |
1/x |
| sin(x) |
cos(x) |
| cos(x) |
-sin(x) |
| 1/(1+e⁻ˣ) (sigmoid) |
σ(x)(1-σ(x)) |
| tanh(x) |
1 - tanh²(x) |
⚠️ Attention: Toujours normaliser les données avant l'entraînement. Cela aide le gradient descent à converger plus rapidement.
🎲 Probabilités
Théorème de Bayes
Bayes: P(A|B) = P(B|A)P(A) / P(B)
# Exemple: Diagnostic médical
# P(Maladie|Test+) = P(Test+|Maladie) × P(Maladie) / P(Test+)
def bayes_theorem(p_b_given_a, p_a, p_b):
    """Return the posterior P(A|B) = P(B|A) · P(A) / P(B)."""
    joint = p_b_given_a * p_a
    return joint / p_b
# Sensibilité test: 95%, Prévalence: 1%, Spécificité: 90%
p_test_pos_given_sick = 0.95 # VP
p_sick = 0.01
p_test_pos_given_healthy = 0.10 # FP
p_healthy = 0.99
p_test_pos = (p_test_pos_given_sick * p_sick +
p_test_pos_given_healthy * p_healthy)
p_sick_given_test_pos = bayes_theorem(
p_test_pos_given_sick, p_sick, p_test_pos
)
print(f"P(Malade|Test+) = {p_sick_given_test_pos:.2%}") # ~8.8%
Distributions
import numpy as np
from scipy import stats
# Distribution Normale (Gaussienne)
mu, sigma = 0, 1
normal = stats.norm(mu, sigma)
samples = normal.rvs(size=1000)
pdf = normal.pdf(x) # Probability Density Function
cdf = normal.cdf(x) # Cumulative Distribution Function
# Distribution Bernoulli (binaire)
p = 0.7
bernoulli = stats.bernoulli(p)
samples = bernoulli.rvs(size=100) # 0 ou 1
# Distribution Binomiale
n, p = 10, 0.5
binomial = stats.binom(n, p)
samples = binomial.rvs(size=1000)
# Distribution Uniforme
uniform = stats.uniform(loc=0, scale=1) # entre 0 et 1
samples = uniform.rvs(size=1000)
# Distribution Exponentielle
lambda_param = 1.5
exponential = stats.expon(scale=1/lambda_param)
samples = exponential.rvs(size=1000)
Softmax
Softmax: σ(z)ᵢ = e^(zᵢ) / Σⱼ e^(zⱼ)
def softmax(x):
    """Numerically stable softmax along the last axis."""
    # Shifting by the row max leaves the result unchanged but avoids overflow.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    total = np.sum(exps, axis=-1, keepdims=True)
    return exps / total
# Exemple
logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)
print(probs) # [0.659, 0.242, 0.099]
print(np.sum(probs)) # 1.0
# PyTorch
import torch.nn.functional as F
logits = torch.tensor([2.0, 1.0, 0.1])
probs = F.softmax(logits, dim=0)
# Log softmax (plus stable pour calculs)
log_probs = F.log_softmax(logits, dim=0)
Statistiques de Base
# Moyenne, médiane, écart-type
data = np.random.randn(1000)
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
variance = np.var(data)
# Quantiles
q25, q50, q75 = np.percentile(data, [25, 50, 75])
# Corrélation
x = np.random.randn(100)
y = 2*x + np.random.randn(100)*0.5
correlation = np.corrcoef(x, y)[0, 1]
# Covariance
covariance = np.cov(x, y)[0, 1]
| Distribution |
Paramètres |
Usage ML |
| Normale |
μ (moyenne), σ (écart-type) |
Initialisation poids, bruit |
| Bernoulli |
p (probabilité succès) |
Classification binaire |
| Uniforme |
a (min), b (max) |
Initialisation poids |
| Exponentielle |
λ (rate) |
Temps d'attente, durées |
📊 Entropie & Information
Entropie de Shannon
Entropie: H(X) = -Σ P(x) log P(x)
def entropy(probs):
    """Shannon entropy of a discrete distribution, in bits."""
    # Zero-probability outcomes contribute nothing; dropping them avoids log(0).
    nonzero = probs[probs > 0]
    return -np.sum(nonzero * np.log2(nonzero))
# Exemple
# Distribution uniforme: haute entropie
uniform_probs = np.array([0.25, 0.25, 0.25, 0.25])
print(f"Entropy (uniform): {entropy(uniform_probs):.2f} bits") # 2.0
# Distribution concentrée: faible entropie
peaked_probs = np.array([0.9, 0.05, 0.03, 0.02])
print(f"Entropy (peaked): {entropy(peaked_probs):.2f} bits") # ~0.64
Cross-Entropy
Cross-Entropy: H(P,Q) = -Σ P(x) log Q(x)
def cross_entropy(y_true, y_pred):
    """Cross-entropy H(P, Q) between the true distribution and a prediction."""
    # The 1e-10 floor keeps log() finite when a predicted probability is 0.
    log_q = np.log(y_pred + 1e-10)
    return -np.sum(y_true * log_q)
# Exemple classification
y_true = np.array([0, 1, 0]) # One-hot classe 1
y_pred = np.array([0.1, 0.8, 0.1]) # Prédiction
ce = cross_entropy(y_true, y_pred)
print(f"Cross-Entropy: {ce:.4f}") # 0.2231
# PyTorch CrossEntropyLoss
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]]) # Avant softmax
target = torch.tensor([0]) # Classe 0
loss = criterion(logits, target)
# Binary Cross-Entropy
bce = nn.BCELoss()
predictions = torch.tensor([0.8, 0.3, 0.9])
targets = torch.tensor([1.0, 0.0, 1.0])
loss = bce(predictions, targets)
KL Divergence
KL Divergence: D_KL(P‖Q) = Σ P(x) log(P(x)/Q(x))
def kl_divergence(p, q):
"""Divergence KL de P vers Q"""
return np.sum(p * np.log(p / (q + 1e-10) + 1e-10))
# Exemple
p = np.array([0.4, 0.3, 0.3])
q = np.array([0.3, 0.4, 0.3])
kl = kl_divergence(p, q)
print(f"KL(P||Q): {kl:.4f}")
# KL n'est PAS symétrique
kl_reverse = kl_divergence(q, p)
print(f"KL(Q||P): {kl_reverse:.4f}")
# PyTorch KLDivLoss
kl_loss = nn.KLDivLoss(reduction='batchmean')
# Input doit être log-probabilities
log_q = torch.log(torch.tensor(q))
p_tensor = torch.tensor(p)
loss = kl_loss(log_q, p_tensor)
Perplexity
Perplexity: PP(P) = 2^H(P) = 2^(−Σ P(x) log₂ P(x))
def perplexity(probs):
    """Perplexity = 2 ** H(probs): the effective branching factor."""
    h = entropy(probs)
    return 2 ** h
# Exemple: Language Model
# Perplexity mesure combien le modèle est "perplexe"
vocab_size = 10000
# Modèle uniforme (très mauvais)
uniform = np.ones(vocab_size) / vocab_size
pp_uniform = perplexity(uniform)
print(f"Perplexity (uniform): {pp_uniform:.0f}") # 10000
# Bon modèle (concentré)
good_model = np.zeros(vocab_size)
good_model[:100] = 0.01 # Masse sur 100 mots
pp_good = perplexity(good_model)
print(f"Perplexity (good): {pp_good:.0f}") # ~100
| Métrique |
Formule |
Interprétation |
| Entropy |
-Σ P log P |
Incertitude moyenne |
| Cross-Entropy |
-Σ P log Q |
Loss classification |
| KL Divergence |
Σ P log(P/Q) |
Distance entre distributions |
| Perplexity |
2^H(P) |
Équivalent branching factor |
💡 Intuition: Cross-Entropy = Entropy + KL Divergence. En minimisant la cross-entropy, on minimise la divergence KL entre prédictions et vraie distribution.
⚡ Optimiseurs
SGD (Stochastic Gradient Descent)
SGD: θ = θ - α∇J(θ)
SGD + Momentum: v = βv + ∇J(θ); θ = θ - αv
import torch.optim as optim
# SGD basique
optimizer = optim.SGD(model.parameters(), lr=0.01)
# SGD avec momentum
optimizer = optim.SGD(model.parameters(), lr=0.01,
momentum=0.9)
# SGD avec momentum et weight decay
optimizer = optim.SGD(model.parameters(), lr=0.01,
momentum=0.9, weight_decay=1e-4)
# Nesterov momentum (accélération)
optimizer = optim.SGD(model.parameters(), lr=0.01,
momentum=0.9, nesterov=True)
Adam & AdamW
Adam:
m = β₁m + (1-β₁)∇J (moment 1er ordre)
v = β₂v + (1-β₂)∇J² (moment 2e ordre)
θ = θ - α·m̂/(√v̂ + ε)
# Adam (le plus populaire)
optimizer = optim.Adam(model.parameters(), lr=0.001,
betas=(0.9, 0.999), eps=1e-8)
# AdamW (meilleur pour transformers)
# Découple weight decay du gradient
optimizer = optim.AdamW(model.parameters(), lr=0.001,
betas=(0.9, 0.999),
weight_decay=0.01)
# Valeurs typiques:
# - lr: 1e-3 to 1e-4
# - beta1: 0.9
# - beta2: 0.999
# - weight_decay: 0.01 (AdamW) ou 0 (Adam)
Autres Optimiseurs
# RMSprop (bon pour RNNs)
optimizer = optim.RMSprop(model.parameters(), lr=0.01,
alpha=0.99, eps=1e-8)
# Adagrad (rare aujourd'hui)
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
# Adadelta
optimizer = optim.Adadelta(model.parameters(), lr=1.0)
# LAMB (pour très gros batches)
# pip install pytorch-lamb
from pytorch_lamb import Lamb
optimizer = Lamb(model.parameters(), lr=0.001)
Learning Rate Scheduling
from torch.optim.lr_scheduler import *
# Linear warmup puis cosine decay
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps,
                                    num_training_steps):
    """LambdaLR schedule: linear warmup to the base LR, then cosine decay to 0.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.
        num_warmup_steps: steps over which LR ramps linearly from 0 to base.
        num_training_steps: total steps; LR reaches 0 at this point.

    Returns:
        A torch LambdaLR scheduler (call .step() once per training step).
    """
    import math  # fix: this section never imports math at the top

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # Linear warmup; max(1, ...) avoids division by zero.
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / \
            float(max(1, num_training_steps - num_warmup_steps))
        # Cosine decay of the LR multiplier from 1 to 0, clamped at 0.
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return LambdaLR(optimizer, lr_lambda)
# One Cycle Policy (très efficace)
scheduler = OneCycleLR(optimizer, max_lr=0.01,
steps_per_epoch=len(train_loader),
epochs=epochs)
# Training loop avec scheduler
for epoch in range(epochs):
for batch in train_loader:
optimizer.zero_grad()
loss = model(batch)
loss.backward()
optimizer.step()
scheduler.step() # Appel après chaque batch
| Optimiseur |
LR Typique |
Cas d'usage |
| SGD + Momentum |
0.01 - 0.1 |
Vision (ResNet, etc.) |
| Adam |
1e-3 - 1e-4 |
Usage général |
| AdamW |
1e-3 - 1e-5 |
Transformers, NLP |
| RMSprop |
1e-3 - 1e-4 |
RNNs, Reinforcement Learning |
🎯 Conseil: Pour transformers, utilisez AdamW avec warmup. Pour CNN, SGD + momentum converge souvent mieux qu'Adam.
🔬 Scikit-learn
Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# Créer pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA(n_components=0.95)), # Garde 95% variance
('classifier', RandomForestClassifier(n_estimators=100))
])
# Fit & predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Accéder aux étapes
scaler = pipeline.named_steps['scaler']
pca = pipeline.named_steps['pca']
GridSearchCV
from sklearn.model_selection import GridSearchCV
# Définir grille de paramètres
param_grid = {
'pca__n_components': [10, 20, 50],
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [5, 10, None],
'classifier__min_samples_split': [2, 5, 10]
}
# GridSearch
grid_search = GridSearchCV(
pipeline,
param_grid,
cv=5, # 5-fold cross-validation
scoring='f1_weighted',
n_jobs=-1, # Parallélisation
verbose=2
)
grid_search.fit(X_train, y_train)
# Meilleurs paramètres
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# Meilleur modèle
best_model = grid_search.best_estimator_
Train-Test Split
from sklearn.model_selection import train_test_split
# Split basique
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Stratified split (garde proportions classes)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
# Split en 3: train/val/test
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=0.5, random_state=42
) # 70% train, 15% val, 15% test
Métriques Courantes
from sklearn.metrics import *
# Classification
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
# Rapport complet
print(classification_report(y_true, y_pred))
# Matrice de confusion
cm = confusion_matrix(y_true, y_pred)
# ROC-AUC
roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
# Régression
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
Preprocessing
from sklearn.preprocessing import *
# Standardization (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Min-Max scaling (range [0,1])
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X_train)
# Robust scaling (robuste aux outliers)
robust = RobustScaler()
X_robust = robust.fit_transform(X_train)
# Label encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# One-hot encoding
ohe = OneHotEncoder(sparse=False)
X_onehot = ohe.fit_transform(X_categorical)
📈 Évaluation Modèles
Precision, Recall, F1
Precision: TP / (TP + FP)
Recall: TP / (TP + FN)
F1: 2 × (Precision × Recall) / (Precision + Recall)
from sklearn.metrics import precision_recall_fscore_support
# Calcul des métriques
precision, recall, f1, support = precision_recall_fscore_support(
y_true, y_pred, average='weighted'
)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
# Par classe
precision, recall, f1, support = precision_recall_fscore_support(
y_true, y_pred, average=None
)
for i, (p, r, f, s) in enumerate(zip(precision, recall, f1, support)):
print(f"Class {i}: P={p:.3f}, R={r:.3f}, F1={f:.3f}, n={s}")
Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Calculer matrice
cm = confusion_matrix(y_true, y_pred)
# Visualiser
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=class_names)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
# Métriques depuis confusion matrix
# Pour binaire (2x2)
TN, FP, FN, TP = cm.ravel()
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN) # = sensitivity
specificity = TN / (TN + FP)
f1 = 2 * (precision * recall) / (precision + recall)
ROC & AUC
from sklearn.metrics import roc_curve, roc_auc_score, auc
# Classification binaire
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2,
label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
# Multi-class ROC
from sklearn.preprocessing import label_binarize
y_bin = label_binarize(y_true, classes=[0, 1, 2])
n_classes = y_bin.shape[1]
# Compute ROC per class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_scores[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
Cross-Validation
from sklearn.model_selection import cross_val_score, cross_validate
# Cross-validation simple
scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted')
print(f"F1 scores: {scores}")
print(f"Mean: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
# Multiple métriques
scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
print(f"Precision: {scores['test_precision_weighted'].mean():.4f}")
print(f"Recall: {scores['test_recall_weighted'].mean():.4f}")
print(f"F1: {scores['test_f1_weighted'].mean():.4f}")
# Stratified K-Fold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)
| Métrique |
Quand l'utiliser |
| Accuracy |
Classes équilibrées |
| Precision |
Minimiser faux positifs (spam, fraude) |
| Recall |
Minimiser faux négatifs (maladie) |
| F1 |
Balance precision/recall, classes déséquilibrées |
| AUC-ROC |
Comparer modèles, indépendant du seuil |
⚠️ Classes déséquilibrées: N'utilisez pas accuracy! Préférez F1-score, AUC-ROC, ou precision/recall selon votre cas d'usage.
🧠 Réseaux de Neurones
Perceptron
Perceptron: y = σ(w·x + b)
w = poids, b = biais, σ = fonction d'activation
import numpy as np
class Perceptron:
    """Single-layer perceptron with a Heaviside step activation."""

    def __init__(self, n_inputs):
        # Random weight init, zero bias.
        self.weights = np.random.randn(n_inputs)
        self.bias = 0

    def predict(self, X):
        """Forward pass: step(w·x + b)."""
        return self.activation(X @ self.weights + self.bias)

    def activation(self, z):
        """Step function: 1 where z >= 0, else 0."""
        return np.where(z >= 0, 1, 0)

    def fit(self, X, y, epochs=100, lr=0.01):
        """Train with the perceptron learning rule (error-driven updates)."""
        for _ in range(epochs):
            for sample, target in zip(X, y):
                err = target - self.predict(sample)
                self.weights += lr * err * sample
                self.bias += lr * err
Fonctions d'Activation
import torch
import torch.nn as nn
# Sigmoid: σ(x) = 1 / (1 + e^(-x))
sigmoid = nn.Sigmoid()
# Sortie: (0, 1), utilisé pour probabilités binaires
# Tanh: tanh(x) = (e^x - e^(-x)) / (e^x + e^(-x))
tanh = nn.Tanh()
# Sortie: (-1, 1), centré sur 0
# ReLU: max(0, x)
relu = nn.ReLU()
# Sortie: [0, ∞), le plus populaire
# Leaky ReLU: max(0.01x, x)
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
# Résout problème "dying ReLU"
# ELU: x si x > 0, α(e^x - 1) sinon
elu = nn.ELU(alpha=1.0)
# GELU: x·Φ(x) (Gaussian Error Linear Unit)
gelu = nn.GELU()
# Utilisé dans BERT, GPT
# Softmax: e^xi / Σe^xj
softmax = nn.Softmax(dim=-1)
# Pour classification multi-classe
Backpropagation
Chain rule: ∂L/∂w = ∂L/∂y · ∂y/∂z · ∂z/∂w
# Backprop manuel (éducatif)
class SimpleNet:
    """Two-layer MLP (ReLU hidden layer, sigmoid output) with manual backprop — educational."""

    def __init__(self, input_size, hidden_size, output_size):
        # Small random init (×0.01) keeps early activations near the linear regime.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        # Forward pass; intermediates are cached on self for backward().
        self.z1 = X @ self.W1 + self.b1
        self.a1 = np.maximum(0, self.z1)  # ReLU
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = 1 / (1 + np.exp(-self.z2))  # Sigmoid
        return self.a2

    def backward(self, X, y, output, lr=0.01):
        # m = batch size; gradients are averaged over the batch.
        m = X.shape[0]
        # Backward pass.
        # (output - y) is the gradient of binary cross-entropy combined with sigmoid.
        dz2 = output - y
        dW2 = (1/m) * self.a1.T @ dz2
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)
        da1 = dz2 @ self.W2.T
        dz1 = da1 * (self.z1 > 0)  # ReLU derivative: 1 where z1 > 0, else 0
        dW1 = (1/m) * X.T @ dz1
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)
        # Gradient-descent parameter update.
        self.W1 -= lr * dW1
        self.b1 -= lr * db1
        self.W2 -= lr * dW2
        self.b2 -= lr * db2
MLP avec PyTorch
class MLP(nn.Module):
    """Multi-layer perceptron: [Linear → ReLU → Dropout(0.2)] per hidden size,
    followed by a final Linear head."""

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        dims = [input_size] + list(hidden_sizes)
        blocks = []
        # Walk consecutive (in, out) dimension pairs to build hidden blocks.
        for d_in, d_out in zip(dims, dims[1:]):
            blocks += [nn.Linear(d_in, d_out), nn.ReLU(), nn.Dropout(0.2)]
        # Output layer (no activation: raw logits).
        blocks.append(nn.Linear(dims[-1], output_size))
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Run x through the sequential stack."""
        return self.network(x)
# Créer modèle
model = MLP(input_size=784,
hidden_sizes=[512, 256, 128],
output_size=10)
| Activation |
Range |
Usage |
| Sigmoid |
(0, 1) |
Classification binaire (sortie) |
| Tanh |
(-1, 1) |
Hidden layers (rare maintenant) |
| ReLU |
[0, ∞) |
Hidden layers (standard) |
| Leaky ReLU |
(-∞, ∞) |
Éviter dying ReLU |
| GELU |
(-∞, ∞) |
Transformers |
| Softmax |
Σ = 1 |
Multi-classe (sortie) |
🔥 PyTorch Essentials
Tensors
import torch
# Créer tensors
x = torch.tensor([1, 2, 3])
y = torch.zeros(3, 4)
z = torch.ones(2, 3)
rand = torch.randn(5, 5) # Distribution normale
# Opérations
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
c = a + b # Element-wise
d = a * b # Element-wise
e = torch.dot(a, b) # Dot product
# Matrix operations
A = torch.randn(3, 4)
B = torch.randn(4, 5)
C = A @ B # Matrix multiply (3, 5)
C = torch.matmul(A, B) # Équivalent
# Reshape
x = torch.arange(12)
x = x.view(3, 4) # Reshape
x = x.view(-1, 2) # Auto-infer: (6, 2)
x = x.reshape(3, 4) # Alternatif
# Device (CPU/GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = x.to(device)
y = torch.randn(3, 4, device=device) # Créer direct sur GPU
Autograd
# Autograd: différentiation automatique
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
# Forward pass
z = x**2 + y**3
w = z * 2
# Backward pass
w.backward()
# Gradients
print(f"dw/dx = {x.grad}") # 8.0 (= 2 * 2x)
print(f"dw/dy = {y.grad}") # 54.0 (= 2 * 3y²)
# Réinitialiser gradients
x.grad.zero_()
y.grad.zero_()
# No grad (pour inference)
with torch.no_grad():
predictions = model(x_test)
# Ou
@torch.no_grad()
def evaluate(model, data):
return model(data)
nn.Module
import torch.nn as nn
class MyModel(nn.Module):
    """Two-layer feed-forward network: Linear → ReLU → Dropout(0.2) → Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layer definitions.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map (batch, input_dim) to (batch, output_dim) logits."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)
# Créer modèle
model = MyModel(input_dim=784, hidden_dim=256, output_dim=10)
# Déplacer sur GPU
model = model.to(device)
# Voir paramètres
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters()
if p.requires_grad)
DataLoader
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Minimal map-style dataset pairing samples with their labels.

    `data` and `labels` must be indexable and of equal length.
    """

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the (sample, label) pair at position idx."""
        return self.data[idx], self.labels[idx]
# Créer dataset
dataset = CustomDataset(X_train, y_train)
# Créer dataloader
dataloader = DataLoader(
dataset,
batch_size=32,
shuffle=True,
num_workers=4, # Parallélisation
pin_memory=True # Optimisation GPU
)
# Itération
for batch_x, batch_y in dataloader:
batch_x = batch_x.to(device)
batch_y = batch_y.to(device)
# Training...
Training Loop
import torch.optim as optim
# Setup: model, loss, optimizer (expects `device`, `train_loader`, `val_loader` defined earlier).
model = MyModel(784, 256, 10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Training mode: dropout active, batch-norm updates running stats
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        # Forward pass
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        # Backward pass: clear stale gradients, backprop, apply the update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Validation
    model.eval()  # Eval mode: dropout off, batch-norm uses running stats
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # no autograd bookkeeping during evaluation
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            # argmax over class dimension gives the predicted label
            _, predicted = outputs.max(1)
            total += batch_y.size(0)
            correct += predicted.eq(batch_y).sum().item()
    accuracy = 100. * correct / total
    print(f"Epoch {epoch+1}: "
          f"Train Loss: {train_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, "
          f"Val Acc: {accuracy:.2f}%")
⚡ Performance: Utilisez pin_memory=True dans DataLoader et .to(device, non_blocking=True) pour transferts GPU plus rapides.
📉 Loss Functions
Mean Squared Error (MSE)
MSE: L = (1/n) Σ (yᵢ - ŷᵢ)²
import torch.nn as nn
# MSE Loss (régression)
mse_loss = nn.MSELoss()
predictions = torch.tensor([2.5, 3.7, 1.2])
targets = torch.tensor([3.0, 4.0, 1.0])
loss = mse_loss(predictions, targets)
# MAE Loss (plus robuste aux outliers)
mae_loss = nn.L1Loss()
loss = mae_loss(predictions, targets)
# Huber Loss (combine MSE et MAE)
huber_loss = nn.SmoothL1Loss()
loss = huber_loss(predictions, targets)
Cross-Entropy Loss
CE: L = -Σ yᵢ log(ŷᵢ)
Pour classification: L = -log(ŷclasse_vraie)
# CrossEntropyLoss (classification multi-classe)
# Combine LogSoftmax + NLLLoss
ce_loss = nn.CrossEntropyLoss()
# Input: logits (avant softmax), shape (batch, num_classes)
logits = torch.tensor([[2.0, 1.0, 0.1],
[0.5, 2.5, 0.3]])
# Target: indices de classe, shape (batch,)
targets = torch.tensor([0, 1])
loss = ce_loss(logits, targets)
# Avec poids de classe (pour déséquilibre)
class_weights = torch.tensor([1.0, 2.0, 1.5])
weighted_ce = nn.CrossEntropyLoss(weight=class_weights)
# Label smoothing (régularisation)
ce_smoothed = nn.CrossEntropyLoss(label_smoothing=0.1)
Binary Cross-Entropy (BCE)
BCE: L = -[y log(ŷ) + (1-y) log(1-ŷ)]
# BCELoss (classification binaire)
# Input doit être entre 0 et 1 (après sigmoid)
bce_loss = nn.BCELoss()
predictions = torch.tensor([0.8, 0.3, 0.9])
targets = torch.tensor([1.0, 0.0, 1.0])
loss = bce_loss(predictions, targets)
# BCEWithLogitsLoss (plus stable numériquement)
# Input: logits (avant sigmoid)
bce_logits = nn.BCEWithLogitsLoss()
logits = torch.tensor([1.5, -0.8, 2.1])
targets = torch.tensor([1.0, 0.0, 1.0])
loss = bce_logits(logits, targets)
# Multi-label classification
# (plusieurs classes peuvent être vraies)
predictions = torch.tensor([[0.8, 0.3, 0.9],
[0.2, 0.7, 0.4]])
targets = torch.tensor([[1.0, 0.0, 1.0],
[0.0, 1.0, 0.0]])
loss = bce_loss(predictions, targets)
Autres Loss Functions
# NLLLoss (Negative Log Likelihood)
# Input doit être log-probabilities
nll_loss = nn.NLLLoss()
log_probs = torch.log_softmax(logits, dim=1)
loss = nll_loss(log_probs, targets)
# KLDivLoss (Divergence KL)
kl_loss = nn.KLDivLoss(reduction='batchmean')
# Input: log-probabilities, Target: probabilities
loss = kl_loss(log_probs, target_probs)
# CosineEmbeddingLoss (pour embeddings)
cosine_loss = nn.CosineEmbeddingLoss()
embedding1 = torch.randn(32, 128)
embedding2 = torch.randn(32, 128)
target = torch.ones(32) # 1 si similaire, -1 si différent
loss = cosine_loss(embedding1, embedding2, target)
# TripletMarginLoss (pour metric learning)
triplet_loss = nn.TripletMarginLoss(margin=1.0)
anchor = torch.randn(32, 128)
positive = torch.randn(32, 128) # Même classe
negative = torch.randn(32, 128) # Classe différente
loss = triplet_loss(anchor, positive, negative)
| Loss |
Task |
Input |
Target |
| MSE |
Régression |
Valeurs réelles |
Valeurs réelles |
| CrossEntropy |
Multi-classe |
Logits |
Indices classe |
| BCE |
Binaire |
Probabilities [0,1] |
0 ou 1 |
| BCEWithLogits |
Binaire |
Logits |
0 ou 1 |
| NLL |
Multi-classe |
Log-probabilities |
Indices classe |
💡 Astuce: Utilisez toujours BCEWithLogitsLoss au lieu de Sigmoid + BCELoss, et CrossEntropyLoss au lieu de Softmax + NLLLoss. Plus stable numériquement!
🖼️ CNN Architecture
Conv2d Parameters
Output size: O = ⌊(W - K + 2P) / S⌋ + 1
W=input, K=kernel, P=padding, S=stride
import torch.nn as nn
# Convolution 2D
conv = nn.Conv2d(
in_channels=3, # RGB
out_channels=64, # Nombre de filtres
kernel_size=3, # 3x3
stride=1,
padding=1, # 'same' padding
bias=True
)
# Exemple calcul taille sortie
# Input: (batch, 3, 32, 32)
# Output: (batch, 64, 32, 32)
# Car: (32 - 3 + 2*1) / 1 + 1 = 32
# Depthwise Separable Convolution (MobileNet)
depthwise = nn.Conv2d(64, 64, kernel_size=3,
padding=1, groups=64)
pointwise = nn.Conv2d(64, 128, kernel_size=1)
# Dilated/Atrous Convolution
dilated = nn.Conv2d(64, 64, kernel_size=3,
padding=2, dilation=2)
Pooling Layers
# Max Pooling (le plus commun)
maxpool = nn.MaxPool2d(
kernel_size=2, # 2x2
stride=2 # Réduit taille par 2
)
# Input: (batch, 64, 32, 32)
# Output: (batch, 64, 16, 16)
# Average Pooling
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
# Adaptive Average Pooling (taille output fixe)
adaptive = nn.AdaptiveAvgPool2d((7, 7))
# Input: (batch, 512, 14, 14)
# Output: (batch, 512, 7, 7)
# Global Average Pooling
global_pool = nn.AdaptiveAvgPool2d((1, 1))
# Input: (batch, 512, 7, 7)
# Output: (batch, 512, 1, 1)
CNN Simple
class SimpleCNN(nn.Module):
    """Three conv blocks (Conv→BN→ReLU→MaxPool), then global average pooling
    and a linear classification head."""

    def __init__(self, num_classes=10):
        super().__init__()

        def block(c_in, c_out):
            # One feature block: 3x3 'same' conv, batch norm, ReLU, 2x downsampling.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2, 2),
            ]

        # Channels double as spatial resolution halves: 3 → 64 → 128 → 256.
        self.features = nn.Sequential(
            *block(3, 64), *block(64, 128), *block(128, 256)
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # global average pool → (batch, 256, 1, 1)
            nn.Flatten(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """(batch, 3, H, W) → (batch, num_classes) logits."""
        return self.classifier(self.features(x))
ResNet Block
class ResidualBlock(nn.Module):
    """Basic ResNet block: two 3x3 convs plus a skip connection.

    A 1x1 projection is inserted on the shortcut whenever the stride or the
    channel count changes, so the residual addition stays shape-compatible.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Shortcut branch: identity when shapes already match.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Skip connection, then the final activation.
        out = out + residual
        return self.relu(out)
| Layer |
Paramètres |
Usage |
| Conv2d |
in_ch, out_ch, kernel, stride, padding |
Extraction features |
| MaxPool2d |
kernel, stride |
Downsampling |
| BatchNorm2d |
num_features |
Normalisation, stabilité |
| Dropout2d |
p (prob drop) |
Régularisation |
| AdaptiveAvgPool2d |
output_size |
Pooling taille fixe |
🎨 Architecture CNN: Pattern typique: Conv → BatchNorm → ReLU → Pool. Doubler les canaux quand on divise la résolution par 2.
🔄 RNN/LSTM/GRU
RNN Simple
RNN: h_t = tanh(W_hh·h_{t-1} + W_xh·x_t + b)
import torch.nn as nn
# RNN basique
rnn = nn.RNN(
input_size=100, # Dimension input
hidden_size=256, # Dimension hidden state
num_layers=2, # Nombre couches empilées
batch_first=True, # Input shape: (batch, seq, features)
dropout=0.2, # Dropout entre couches
bidirectional=False
)
# Forward pass
x = torch.randn(32, 10, 100) # (batch, seq_len, input_size)
h0 = torch.zeros(2, 32, 256) # (num_layers, batch, hidden_size)
output, hn = rnn(x, h0)
# output: (32, 10, 256) - output à chaque timestep
# hn: (2, 32, 256) - dernier hidden state
# RNN Cell (manuel)
rnn_cell = nn.RNNCell(input_size=100, hidden_size=256)
h = torch.zeros(32, 256)
outputs = []
for t in range(10):
h = rnn_cell(x[:, t, :], h)
outputs.append(h)
LSTM
LSTM gates:
f_t = σ(W_f·[h_{t-1}, x_t]) (forget)
i_t = σ(W_i·[h_{t-1}, x_t]) (input)
o_t = σ(W_o·[h_{t-1}, x_t]) (output)
# LSTM (le plus utilisé pour séquences)
lstm = nn.LSTM(
input_size=100,
hidden_size=256,
num_layers=2,
batch_first=True,
dropout=0.2,
bidirectional=True # Bi-LSTM
)
# Forward pass
x = torch.randn(32, 10, 100)
h0 = torch.zeros(2*2, 32, 256) # *2 car bidirectionnel
c0 = torch.zeros(2*2, 32, 256) # Cell state
output, (hn, cn) = lstm(x, (h0, c0))
# output: (32, 10, 512) - 512 car bidirectionnel
# hn: (4, 32, 256) - hidden state
# cn: (4, 32, 256) - cell state
# Exemple: Sequence Classification
class LSTMClassifier(nn.Module):
    """Token-sequence classifier: embedding → bidirectional LSTM → linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            batch_first=True, bidirectional=True)
        # hidden_dim * 2: forward and backward final states are concatenated.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """x: (batch, seq_len) of token ids → (batch, num_classes) logits."""
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        _, (final_hidden, _) = self.lstm(embedded)
        # The last two entries of h_n are the top layer's forward/backward states.
        features = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        return self.fc(features)
GRU
GRU: Plus simple que LSTM, souvent aussi bon
r_t = σ(W_r·[h_{t-1}, x_t]) (reset)
z_t = σ(W_z·[h_{t-1}, x_t]) (update)
# GRU (plus rapide que LSTM)
gru = nn.GRU(
input_size=100,
hidden_size=256,
num_layers=2,
batch_first=True,
dropout=0.2,
bidirectional=True
)
# Forward pass
x = torch.randn(32, 10, 100)
h0 = torch.zeros(2*2, 32, 256)
output, hn = gru(x, h0)
# Pas de cell state (plus simple que LSTM)
Quand utiliser chaque type
| Type |
Avantages |
Quand utiliser |
| RNN |
Simple, rapide |
Séquences courtes, baseline |
| LSTM |
Gère dépendances longues, stable |
Séquences longues, standard NLP |
| GRU |
Plus rapide que LSTM, moins de paramètres |
Alternative à LSTM, contraintes compute |
| Bi-LSTM/GRU |
Context bidirectionnel |
Classification, pas génération |
Handling Sequences
# Padding sequences (différentes longueurs)
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
# Sequences de longueurs variables
seqs = [torch.randn(5, 100),
torch.randn(8, 100),
torch.randn(3, 100)]
# Padding
padded = pad_sequence(seqs, batch_first=True)
# Shape: (3, 8, 100) - paddé à max length
# Pack (efficace pour training)
lengths = torch.tensor([5, 8, 3])
packed = pack_padded_sequence(padded, lengths,
batch_first=True,
enforce_sorted=False)
# Forward avec packed
output, hn = lstm(packed)
# Unpack
output, lengths = pad_packed_sequence(output, batch_first=True)
⚠️ Vanishing Gradient: RNN basiques souffrent de vanishing gradient sur longues séquences. LSTM/GRU résolvent ce problème. Pour séquences très longues (>500), considérez Transformer.
🤖 Transformer Basics
Self-Attention
Attention: Attention(Q,K,V) = softmax(QKᵀ/√d_k)V
Q=queries, K=keys, V=values, d_k=dimension des keys
import torch
import torch.nn as nn
import math
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Computes softmax(Q Kᵀ / √d) V where Q, K, V are linear projections of x.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        # Separate Q, K, V projections.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.scale = math.sqrt(embed_dim)

    def forward(self, x, mask=None):
        """x: (batch, seq_len, embed_dim) → (output, attention_weights)."""
        q, k, v = self.query(x), self.key(x), self.value(x)
        # Pairwise similarity, scaled: (batch, seq_len, seq_len).
        scores = torch.bmm(q, k.transpose(1, 2)) / self.scale
        if mask is not None:
            # Large negative score → ~0 weight after softmax (padding/causal mask).
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1)
        # Attention-weighted sum over values: (batch, seq_len, embed_dim).
        return torch.bmm(weights, v), weights
Multi-Head Attention
Multi-Head: MultiHead(Q,K,V) = Concat(head_1, ..., head_h)·W^O
où head_i = Attention(Q·W_i^Q, K·W_i^K, V·W_i^V)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # embed_dim must split evenly across heads.
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # One linear layer produces Q, K and V in a single matmul.
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.out = nn.Linear(embed_dim, embed_dim)
        # Scale by √head_dim (per-head dimension), not √embed_dim.
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x, mask=None):
        """x: (batch, seq_len, embed_dim) → (batch, seq_len, embed_dim)."""
        batch_size, seq_len, embed_dim = x.shape
        # Compute Q, K, V for all heads at once.
        qkv = self.qkv(x)  # (batch, seq_len, 3*embed_dim)
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch, heads, seq_len, head_dim)
        Q, K, V = qkv[0], qkv[1], qkv[2]
        # Scaled attention scores: (batch, heads, seq_len, seq_len).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            # Masked positions get ~-inf so softmax assigns them ~0 weight.
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        # Per-head weighted sum of values.
        output = torch.matmul(attn, V)  # (batch, heads, seq_len, head_dim)
        # Merge heads back into a single embed_dim vector per position.
        output = output.transpose(1, 2)  # (batch, seq_len, heads, head_dim)
        output = output.reshape(batch_size, seq_len, embed_dim)
        output = self.out(output)
        return output
# PyTorch built-in
mha = nn.MultiheadAttention(
embed_dim=512,
num_heads=8,
dropout=0.1,
batch_first=True
)
output, attn_weights = mha(x, x, x) # Self-attention
Positional Encoding
Position:
PE(pos, 2i) = sin(pos / 10000^(2i/d))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding: sin on even dims, cos on odd dims."""

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        positions = torch.arange(0, max_len).unsqueeze(1).float()
        # Frequencies decay geometrically: 1 / 10000^(2i/d).
        freqs = torch.exp(torch.arange(0, embed_dim, 2).float()
                          * (-math.log(10000.0) / embed_dim))
        table = torch.zeros(max_len, embed_dim)
        table[:, 0::2] = torch.sin(positions * freqs)
        table[:, 1::2] = torch.cos(positions * freqs)
        # Registered as a buffer: saved with the model, never trained.
        self.register_buffer('pe', table.unsqueeze(0))  # (1, max_len, embed_dim)

    def forward(self, x):
        """Add the first seq_len position encodings to x: (batch, seq_len, embed_dim)."""
        return x + self.pe[:, :x.size(1), :]
# Alternative: Learned positional embeddings
class LearnedPositionalEmbedding(nn.Module):
    """Trainable positional embeddings added to the input sequence."""

    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(max_len, embed_dim)

    def forward(self, x):
        """x: (batch, seq_len, embed_dim); adds one learned vector per position."""
        seq_len = x.size(1)
        # Position ids 0..seq_len-1, broadcast over the batch dimension.
        idx = torch.arange(seq_len, device=x.device).unsqueeze(0)
        return x + self.embedding(idx)
Transformer Block
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder layer: multi-head self-attention then a
    feed-forward network, each with a residual connection and LayerNorm."""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Multi-head self-attention.
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )
        # Position-wise feed-forward: expand to ff_dim, project back to embed_dim.
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout)
        )
        # One LayerNorm per sub-layer (post-norm arrangement).
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """x: (batch, seq_len, embed_dim) → same shape."""
        # Multi-head attention + residual + norm.
        attn_output, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Feed-forward + residual + norm.
        ff_output = self.ff(x)
        x = self.norm2(x + ff_output)
        return x
# Complete encoder-only Transformer classifier.
class Transformer(nn.Module):
    """Token embedding + positional encoding → num_layers TransformerBlocks →
    mean pooling over the sequence → linear classification head."""

    def __init__(self, vocab_size, embed_dim, num_heads,
                 num_layers, ff_dim, max_len, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token ids.
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x)
        # Pooling (alternative: use a dedicated [CLS] token instead of the mean).
        x = x.mean(dim=1)  # (batch, embed_dim)
        logits = self.classifier(x)
        return logits
| Composant |
Rôle |
| Self-Attention |
Relations entre tokens |
| Multi-Head |
Différents aspects d'attention |
| Positional Encoding |
Information de position |
| Feed-Forward |
Transformation non-linéaire |
| Layer Norm |
Stabilisation training |
| Residual Connections |
Gradient flow, deep networks |
🚀 Transformers: Remplacent RNN/LSTM dans la plupart des tâches NLP. Parallélisables (vs RNN séquentiel) et capturent mieux les dépendances longues. Base de BERT, GPT, etc.