📚 Cheatsheet Phase 1

Fondations IA & Environnement

🔧 Setup Environnement IA

Créer un environnement Conda

# Créer environnement avec Python 3.10 conda create -n ia-env python=3.10 conda activate ia-env # Installer PyTorch avec CUDA 11.8 conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia # Ou avec pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Vérifier CUDA

import torch # Vérifier disponibilité CUDA print(f"CUDA disponible: {torch.cuda.is_available()}") print(f"Version CUDA: {torch.version.cuda}") print(f"Nombre de GPUs: {torch.cuda.device_count()}") print(f"GPU actuel: {torch.cuda.get_device_name(0)}") # Tester calcul GPU x = torch.rand(1000, 1000).cuda() y = torch.rand(1000, 1000).cuda() z = x @ y # Multiplication sur GPU

WSL2 pour Windows

# Installation WSL2 wsl --install wsl --set-default-version 2 # Installer Ubuntu wsl --install -d Ubuntu-22.04 # Dans WSL, installer conda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh bash Miniconda3-latest-Linux-x86_64.sh # Installer CUDA Toolkit dans WSL wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
💡 Astuce: Utilisez conda env export > environment.yml pour sauvegarder votre environnement et le partager.

🐍 Python Essentials

Decorators

import time
from functools import wraps


def timer(func):
    """Decorator that prints how long the wrapped call took."""
    @wraps(func)  # keep the wrapped function's name/docstring
    def timed(*args, **kwargs):
        t0 = time.time()
        out = func(*args, **kwargs)
        print(f"{func.__name__} took {time.time()-t0:.4f}s")
        return out
    return timed


@timer
def train_model(epochs):
    # Simulated training work
    time.sleep(2)


# Property decorator: controlled attribute access without getters/setters
class Model:
    """Tiny example of exposing a private field through a property."""

    def __init__(self):
        self._weights = None

    @property
    def weights(self):
        return self._weights

    @weights.setter
    def weights(self, new_weights):
        self._weights = new_weights

Generators

def data_generator(batch_size, num_batches):
    """Lazily yield `num_batches` random (features, labels) batches."""
    for _ in range(num_batches):
        # MNIST-sized fake features and integer class labels in [0, 10)
        inputs = torch.randn(batch_size, 784)
        targets = torch.randint(0, 10, (batch_size,))
        yield inputs, targets


# Usage: one batch at a time, nothing materialised up front
for batch, labels in data_generator(32, 100):
    pass  # process the batch here

# Generator expression: values are produced on demand, saving memory
squares = (x**2 for x in range(1000000))

Type Hints

from typing import List, Dict, Tuple, Optional, Union import numpy as np def process_data( data: np.ndarray, labels: Optional[np.ndarray] = None, batch_size: int = 32 ) -> Tuple[np.ndarray, Dict[str, float]]: """Traite les données et retourne résultats + métriques""" processed = data * 2.0 metrics = {"mean": float(data.mean()), "std": float(data.std())} return processed, metrics # Type pour modèle from torch import nn ModelType = nn.Module OptimizerType = Union[torch.optim.Adam, torch.optim.SGD]

Comprehensions

# List comprehension squares = [x**2 for x in range(10) if x % 2 == 0] # Dict comprehension metrics = {f"layer_{i}": torch.randn(10) for i in range(5)} # Set comprehension unique_labels = {label for batch in dataset for label in batch} # Nested comprehension matrix = [[i*j for j in range(5)] for i in range(5)]

🔢 NumPy Operations

Array Creation

import numpy as np # Différentes façons de créer arrays a = np.array([1, 2, 3, 4, 5]) b = np.zeros((3, 4)) c = np.ones((2, 3, 4)) d = np.arange(0, 10, 0.5) # 0 à 10 par pas de 0.5 e = np.linspace(0, 1, 100) # 100 valeurs entre 0 et 1 f = np.random.randn(5, 5) # Distribution normale g = np.eye(4) # Matrice identité 4x4 h = np.full((3, 3), 7) # Matrice remplie de 7

Reshaping

# Reshape x = np.arange(12) x_reshaped = x.reshape(3, 4) # 3x4 x_auto = x.reshape(-1, 2) # Calcul auto: 6x2 # Transpose x_t = x_reshaped.T x_swap = x.reshape(2, 3, 2).swapaxes(0, 1) # Flatten x_flat = x_reshaped.flatten() # Copie x_ravel = x_reshaped.ravel() # Vue si possible # Add dimension x_expanded = x[np.newaxis, :] # (1, 12) x_expanded2 = x[:, np.newaxis] # (12, 1)

Broadcasting

# Broadcasting examples a = np.array([[1, 2, 3], [4, 5, 6]]) # (2, 3) b = np.array([10, 20, 30]) # (3,) c = a + b # Broadcast b sur chaque ligne # Normalisation avec broadcasting X = np.random.randn(100, 10) mean = X.mean(axis=0, keepdims=True) # (1, 10) std = X.std(axis=0, keepdims=True) # (1, 10) X_normalized = (X - mean) / std # Broadcasting pour distance matrices points = np.random.randn(50, 3) # 50 points en 3D distances = np.sqrt(((points[:, np.newaxis] - points) ** 2).sum(axis=2))

Einsum

# Einsum: notation Einstein pour opérations tensorielles # Dot product a = np.random.randn(5) b = np.random.randn(5) dot = np.einsum('i,i->', a, b) # Équivalent à np.dot(a, b) # Matrix multiplication A = np.random.randn(3, 4) B = np.random.randn(4, 5) C = np.einsum('ij,jk->ik', A, B) # A @ B # Batch matrix multiplication batch_A = np.random.randn(10, 3, 4) batch_B = np.random.randn(10, 4, 5) batch_C = np.einsum('bij,bjk->bik', batch_A, batch_B) # Trace d'une matrice trace = np.einsum('ii->', A) # Outer product outer = np.einsum('i,j->ij', a, b)
⚡ Performance: Utilisez toujours des opérations vectorisées plutôt que des boucles Python. NumPy est jusqu'à 100x plus rapide!

🐼 Pandas Essentials

Read & Write CSV

import pandas as pd # Lecture CSV df = pd.read_csv('data.csv') df = pd.read_csv('data.csv', sep=';', encoding='utf-8', parse_dates=['date'], index_col=0) # Lecture avec chunks pour gros fichiers for chunk in pd.read_csv('huge_file.csv', chunksize=10000): process(chunk) # Écriture CSV df.to_csv('output.csv', index=False) df.to_csv('output.csv', sep='\t', encoding='utf-8')

GroupBy

# GroupBy basique grouped = df.groupby('category')['value'].mean() # Multiple aggregations agg_result = df.groupby('category').agg({ 'value': ['mean', 'std', 'count'], 'price': ['min', 'max', 'median'] }) # Custom aggregation def custom_metric(x): return (x.max() - x.min()) / x.mean() df.groupby('category')['value'].apply(custom_metric) # Multiple groupby keys df.groupby(['category', 'subcategory']).mean() # Transform (garde shape originale) df['normalized'] = df.groupby('category')['value'].transform( lambda x: (x - x.mean()) / x.std() )

Merge & Join

# Merge (comme SQL JOIN) result = pd.merge(df1, df2, on='key', how='inner') # inner, left, right, outer # Merge sur multiple colonnes result = pd.merge(df1, df2, on=['key1', 'key2']) # Merge avec suffixes result = pd.merge(df1, df2, on='key', suffixes=('_left', '_right')) # Join sur index result = df1.join(df2, how='left') # Concat (empile dataframes) result = pd.concat([df1, df2], axis=0) # Vertical result = pd.concat([df1, df2], axis=1) # Horizontal

Pivot Table

# Pivot table pivot = df.pivot_table( values='sales', index='date', columns='category', aggfunc='sum', fill_value=0 ) # Multiple aggregations pivot_multi = df.pivot_table( values='sales', index='date', columns='category', aggfunc=['sum', 'mean', 'count'] ) # Pivot simple (pas d'aggregation) pivoted = df.pivot(index='date', columns='category', values='sales')

Gestion valeurs manquantes

# Détecter NaN df.isna().sum() # Nombre par colonne df.isnull().any() # Colonnes avec NaN # Remplir NaN df.fillna(0) # Avec constante df.fillna(method='ffill') # Forward fill df.fillna(method='bfill') # Backward fill df.fillna(df.mean()) # Avec moyenne # Remplir par colonne df['age'].fillna(df['age'].median(), inplace=True) # Supprimer NaN df.dropna() # Supprime lignes avec NaN df.dropna(axis=1) # Supprime colonnes avec NaN df.dropna(thresh=2) # Garde lignes avec au moins 2 non-NaN
🚀 Optimisation: Utilisez df.query() au lieu de df[df['col'] > 5] pour de meilleures performances sur gros datasets.

📊 Matplotlib & Plotly

Matplotlib Subplots

import matplotlib.pyplot as plt import numpy as np # Subplots basiques fig, axes = plt.subplots(2, 2, figsize=(12, 8)) axes[0, 0].plot(x, y) axes[0, 0].set_title('Line Plot') axes[0, 1].scatter(x, y, alpha=0.5) axes[0, 1].set_title('Scatter') axes[1, 0].hist(data, bins=30, edgecolor='black') axes[1, 0].set_title('Histogram') axes[1, 1].bar(categories, values) axes[1, 1].set_title('Bar Chart') plt.tight_layout() plt.savefig('plots.png', dpi=300, bbox_inches='tight') plt.show()

Heatmap & Confusion Matrix

import seaborn as sns from sklearn.metrics import confusion_matrix # Heatmap de corrélation correlation = df.corr() plt.figure(figsize=(10, 8)) sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, square=True, linewidths=1) plt.title('Matrice de Corrélation') plt.show() # Confusion Matrix y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 2, 1, 0, 1, 1] cm = confusion_matrix(y_true, y_pred) plt.figure(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') plt.ylabel('True Label') plt.xlabel('Predicted Label') plt.title('Confusion Matrix') plt.show()

Plotly Interactive

import plotly.graph_objects as go import plotly.express as px # Line plot interactif fig = go.Figure() fig.add_trace(go.Scatter(x=x, y=y1, mode='lines', name='Train')) fig.add_trace(go.Scatter(x=x, y=y2, mode='lines', name='Val')) fig.update_layout( title='Training History', xaxis_title='Epoch', yaxis_title='Loss', hovermode='x unified' ) fig.show() # Plotly Express (plus simple) df = px.data.iris() fig = px.scatter(df, x='sepal_width', y='sepal_length', color='species', size='petal_length', hover_data=['petal_width']) fig.show() # 3D Scatter fig = go.Figure(data=[go.Scatter3d( x=X[:, 0], y=X[:, 1], z=X[:, 2], mode='markers', marker=dict(size=5, color=labels, colorscale='Viridis') )]) fig.show()

Visualisation Métriques ML

from sklearn.metrics import roc_curve, auc # ROC Curve fpr, tpr, thresholds = roc_curve(y_true, y_scores) roc_auc = auc(fpr, tpr) plt.figure(figsize=(8, 6)) plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})') plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc="lower right") plt.show()

📓 Jupyter Tips

Magic Commands

# Mesurer temps d'exécution %timeit sum(range(1000)) %%timeit # Mesure tout le cell result = expensive_computation() # Matplotlib inline %matplotlib inline # Ou interactif %matplotlib notebook # Recharger modules automatiquement %load_ext autoreload %autoreload 2 # Afficher variables %who # Liste variables %whos # Détails variables # Exécuter script Python %run script.py # Historique commandes %history -n 1-10 # Debugging %pdb # Active debugger sur erreur

Cell Magics

# %%time - Temps d'exécution cell %%time df = pd.read_csv('large_file.csv') result = df.groupby('category').mean() # %%writefile - Écrire dans fichier %%writefile utils.py def helper_function(): return "Hello from file" # %%bash - Commandes bash %%bash ls -la pwd echo "Current directory" # %%html - HTML custom %%html <div style="color: red;">Important!</div> # %%javascript - JS dans notebook %%javascript alert('Hello from JS');

Extensions Utiles

# Installer extensions pip install jupyter_contrib_nbextensions jupyter contrib nbextension install --user # Extensions recommandées: # - Table of Contents: navigation facile # - Variable Inspector: voir variables en temps réel # - ExecuteTime: temps d'exécution cells # - Collapsible Headings: replier sections # - Code prettify: formatter code automatiquement # JupyterLab extensions jupyter labextension install @jupyterlab/toc jupyter labextension install @jupyter-widgets/jupyterlab-manager

Keyboard Shortcuts

Raccourci Action
Shift + Enter Exécuter cell et passer à suivante
Ctrl + Enter Exécuter cell
A Insérer cell au dessus
B Insérer cell en dessous
DD Supprimer cell
M Convertir en Markdown
Y Convertir en Code
Shift + M Fusionner cells
💡 Pro Tip: Utilisez ?function_name pour afficher la docstring, et ??function_name pour voir le code source!

🔀 Git pour ML

Git LFS (Large File Storage)

# Installer Git LFS git lfs install # Tracker fichiers volumineux git lfs track "*.pth" git lfs track "*.h5" git lfs track "*.pkl" git lfs track "data/*.csv" # Vérifier fichiers trackés git lfs ls-files # Cloner repo avec LFS git lfs clone https://github.com/user/repo.git # Pull fichiers LFS git lfs pull

.gitignore pour ML

# Python __pycache__/ *.py[cod] *$py.class .Python env/ venv/ # Jupyter .ipynb_checkpoints *.ipynb_checkpoints/ # Data data/raw/* data/processed/* !data/.gitkeep # Models models/*.pth models/*.h5 models/*.pkl !models/.gitkeep # Logs & Experiments logs/ runs/ mlruns/ wandb/ # Environment .env .env.local credentials.json # IDE .vscode/ .idea/ *.swp # OS .DS_Store Thumbs.db

DVC (Data Version Control)

# Installer DVC pip install dvc # Initialiser DVC dvc init # Ajouter remote storage (S3, GCS, Azure...) dvc remote add -d storage s3://mybucket/dvc-storage # Tracker données avec DVC dvc add data/train.csv git add data/train.csv.dvc data/.gitignore git commit -m "Add training data" # Pousser données vers remote dvc push # Pull données dvc pull # Créer pipeline ML dvc run -n preprocess \ -d src/preprocess.py -d data/raw \ -o data/processed \ python src/preprocess.py # Visualiser pipeline dvc dag

Commits & Branches ML

# Branches pour expérimentations git checkout -b experiment/transformer-v2 git checkout -b feature/data-augmentation # Commit avec métriques git commit -m "Train ResNet50 - Acc: 94.5% - Loss: 0.23" # Tags pour versions modèles git tag -a v1.0 -m "Model v1.0 - Production ready" git push origin v1.0 # Stash pour sauvegarder WIP git stash save "WIP: testing new architecture" git stash list git stash pop # Rebase interactif pour nettoyer historique git rebase -i HEAD~5
🎯 Best Practice: Ne jamais commiter les poids des modèles dans Git. Utilisez Git LFS ou DVC pour les fichiers > 100MB.

🧮 Algèbre Linéaire

Dot Product & Matrix Multiply

Dot Product: a · b = Σ aᵢbᵢ = |a||b|cos(θ)
# Dot product a = np.array([1, 2, 3]) b = np.array([4, 5, 6]) dot = np.dot(a, b) # 32 # Matrix multiplication A = np.random.randn(3, 4) B = np.random.randn(4, 5) C = A @ B # (3, 5) C = np.matmul(A, B) # Équivalent # Batch matrix multiplication batch_A = np.random.randn(10, 3, 4) batch_B = np.random.randn(10, 4, 5) batch_C = batch_A @ batch_B # (10, 3, 5)

Eigenvalues & Eigenvectors

Équation propre: Av = λv
# Valeurs et vecteurs propres A = np.random.randn(5, 5) A = A @ A.T # Matrice symétrique eigenvalues, eigenvectors = np.linalg.eig(A) # Vérification v = eigenvectors[:, 0] lambda_v = eigenvalues[0] assert np.allclose(A @ v, lambda_v * v) # Diagonalisation: A = VΛV⁻¹ Lambda = np.diag(eigenvalues) V = eigenvectors A_reconstructed = V @ Lambda @ np.linalg.inv(V)

SVD (Singular Value Decomposition)

SVD: A = UΣVᵀ
# SVD A = np.random.randn(100, 50) U, s, Vt = np.linalg.svd(A, full_matrices=False) # U: (100, 50), s: (50,), Vt: (50, 50) # Reconstruction Sigma = np.diag(s) A_reconstructed = U @ Sigma @ Vt # Compression avec SVD (garder k composantes) k = 10 A_compressed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :] # Variance expliquée variance_explained = np.cumsum(s**2) / np.sum(s**2)

Normes & Distances

# Normes de vecteurs v = np.array([3, 4]) # L1 norm (Manhattan) l1 = np.linalg.norm(v, ord=1) # 7 # L2 norm (Euclidean) l2 = np.linalg.norm(v, ord=2) # 5 # L-inf norm (maximum) linf = np.linalg.norm(v, ord=np.inf) # 4 # Distance entre vecteurs a = np.array([1, 2, 3]) b = np.array([4, 5, 6]) euclidean_dist = np.linalg.norm(a - b) # Cosine similarity cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
Opération NumPy Dimension
Dot product np.dot(a, b) (n,) × (n,) → scalaire
Matrix multiply A @ B (m,n) × (n,p) → (m,p)
Element-wise A * B (m,n) × (m,n) → (m,n)
Outer product np.outer(a, b) (m,) × (n,) → (m,n)

📐 Dérivées & Gradient

Chain Rule

Règle de la chaîne: d/dx[f(g(x))] = f'(g(x)) · g'(x)
# Example: y = (x² + 1)³
# f(u) = u³, g(x) = x² + 1
# dy/dx = 3u² · 2x = 3(x² + 1)² · 2x
import torch

x = torch.tensor(2.0, requires_grad=True)
y = (x**2 + 1)**3
y.backward()
# At x = 2: 3·(2²+1)²·2·2 = 3·25·4 = 300 (not 180 as previously stated)
print(f"dy/dx = {x.grad}")  # 300.0

# Multi-variable chain rule
x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)
z = x**2 + y**2
w = torch.exp(z)
w.backward()
print(f"dw/dx = {x.grad}")  # 2x·exp(x²+y²)
print(f"dw/dy = {y.grad}")  # 2y·exp(x²+y²)

Gradient Descent

Mise à jour: θ = θ - α∇J(θ)
α = learning rate, ∇J = gradient de la loss
# Plain batch gradient descent
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit linear parameters theta on (X, y) by minimising MSE.

    Returns the learned parameter vector theta of shape (n_features,).
    """
    m, n = X.shape
    theta = np.zeros(n)
    for epoch in range(epochs):
        # Forward pass and residual
        predictions = X @ theta
        residual = predictions - y
        # Gradient of the (1/2m)·MSE loss
        gradient = (1/m) * X.T @ residual
        # Parameter update
        theta = theta - learning_rate * gradient
        # Track the loss
        loss = (1/(2*m)) * np.sum(residual**2)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Loss = {loss:.4f}")
    return theta


# Same loop expressed with PyTorch
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(100):
    predictions = model(X)
    loss = criterion(predictions, y)
    optimizer.zero_grad()  # Reset gradients
    loss.backward()        # Compute gradients
    optimizer.step()       # Update parameters

Learning Rate

# Learning rate trop grand: divergence # Learning rate trop petit: convergence lente # Stratégies learning rate from torch.optim.lr_scheduler import * # Step decay scheduler = StepLR(optimizer, step_size=10, gamma=0.1) # Exponential decay scheduler = ExponentialLR(optimizer, gamma=0.95) # Cosine annealing scheduler = CosineAnnealingLR(optimizer, T_max=100) # Reduce on plateau scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10) # Utilisation for epoch in range(100): train(...) val_loss = validate(...) scheduler.step(val_loss) # ou scheduler.step() selon type

Dérivées Courantes

Fonction Dérivée
xⁿ nxⁿ⁻¹
eˣ eˣ
ln(x) 1/x
sin(x) cos(x)
cos(x) -sin(x)
σ(x) = 1/(1+e⁻ˣ) (sigmoid) σ(x)(1-σ(x))
tanh(x) 1 - tanh²(x)
⚠️ Attention: Toujours normaliser les données avant l'entraînement. Cela aide le gradient descent à converger plus rapidement.

🎲 Probabilités

Théorème de Bayes

Bayes: P(A|B) = P(B|A)P(A) / P(B)
# Example: medical diagnosis
# P(Disease|Test+) = P(Test+|Disease) x P(Disease) / P(Test+)
def bayes_theorem(p_b_given_a, p_a, p_b):
    """Return P(A|B) computed with Bayes' rule."""
    return (p_b_given_a * p_a) / p_b

# Test sensitivity: 95%, prevalence: 1%, specificity: 90%
p_test_pos_given_sick = 0.95     # true positives
p_sick = 0.01
p_test_pos_given_healthy = 0.10  # false positives
p_healthy = 0.99

# Law of total probability: P(Test+)
p_test_pos = (p_test_pos_given_sick * p_sick
              + p_test_pos_given_healthy * p_healthy)

p_sick_given_test_pos = bayes_theorem(
    p_test_pos_given_sick, p_sick, p_test_pos
)
print(f"P(Malade|Test+) = {p_sick_given_test_pos:.2%}")  # ~8.8%

Distributions

import numpy as np from scipy import stats # Distribution Normale (Gaussienne) mu, sigma = 0, 1 normal = stats.norm(mu, sigma) samples = normal.rvs(size=1000) pdf = normal.pdf(x) # Probability Density Function cdf = normal.cdf(x) # Cumulative Distribution Function # Distribution Bernoulli (binaire) p = 0.7 bernoulli = stats.bernoulli(p) samples = bernoulli.rvs(size=100) # 0 ou 1 # Distribution Binomiale n, p = 10, 0.5 binomial = stats.binom(n, p) samples = binomial.rvs(size=1000) # Distribution Uniforme uniform = stats.uniform(loc=0, scale=1) # entre 0 et 1 samples = uniform.rvs(size=1000) # Distribution Exponentielle lambda_param = 1.5 exponential = stats.expon(scale=1/lambda_param) samples = exponential.rvs(size=1000)

Softmax

Softmax: σ(z)ᵢ = e^zᵢ / Σⱼ e^zⱼ
def softmax(x):
    """Numerically stable softmax along the last axis."""
    # Subtracting the row max leaves the result unchanged but
    # keeps np.exp from overflowing.
    shifted = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return shifted / np.sum(shifted, axis=-1, keepdims=True)

# Example
logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)
print(probs)          # [0.659, 0.242, 0.099]
print(np.sum(probs))  # 1.0

# PyTorch equivalents
import torch.nn.functional as F
logits = torch.tensor([2.0, 1.0, 0.1])
probs = F.softmax(logits, dim=0)

# Log softmax (numerically safer for downstream loss computations)
log_probs = F.log_softmax(logits, dim=0)

Statistiques de Base

# Moyenne, médiane, écart-type data = np.random.randn(1000) mean = np.mean(data) median = np.median(data) std = np.std(data) variance = np.var(data) # Quantiles q25, q50, q75 = np.percentile(data, [25, 50, 75]) # Corrélation x = np.random.randn(100) y = 2*x + np.random.randn(100)*0.5 correlation = np.corrcoef(x, y)[0, 1] # Covariance covariance = np.cov(x, y)[0, 1]
Distribution Paramètres Usage ML
Normale μ (moyenne), σ (écart-type) Initialisation poids, bruit
Bernoulli p (probabilité succès) Classification binaire
Uniforme a (min), b (max) Initialisation poids
Exponentielle λ (rate) Temps d'attente, durées

📊 Entropie & Information

Entropie de Shannon

Entropie: H(X) = -Σ P(x) log P(x)
def entropy(probs):
    """Shannon entropy in bits; zero-probability entries contribute 0."""
    nonzero = probs[probs > 0]  # avoid log(0)
    return -np.sum(nonzero * np.log2(nonzero))

# A uniform distribution has maximal entropy...
uniform_probs = np.array([0.25, 0.25, 0.25, 0.25])
print(f"Entropy (uniform): {entropy(uniform_probs):.2f} bits")  # 2.0

# ...while a peaked distribution carries little uncertainty
peaked_probs = np.array([0.9, 0.05, 0.03, 0.02])
print(f"Entropy (peaked): {entropy(peaked_probs):.2f} bits")  # ~0.62

Cross-Entropy

Cross-Entropy: H(P,Q) = -Σ P(x) log Q(x)
def cross_entropy(y_true, y_pred):
    """Cross-entropy between the true distribution and a prediction."""
    # The epsilon keeps log() finite when a predicted probability is 0.
    return -np.sum(y_true * np.log(y_pred + 1e-10))

# Classification example
y_true = np.array([0, 1, 0])        # one-hot, class 1
y_pred = np.array([0.1, 0.8, 0.1])  # predicted distribution
ce = cross_entropy(y_true, y_pred)
print(f"Cross-Entropy: {ce:.4f}")  # 0.2231

# PyTorch CrossEntropyLoss — expects raw logits, applies softmax itself
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]])  # pre-softmax scores
target = torch.tensor([0])                # class 0
loss = criterion(logits, target)

# Binary Cross-Entropy — expects probabilities in [0, 1]
bce = nn.BCELoss()
predictions = torch.tensor([0.8, 0.3, 0.9])
targets = torch.tensor([1.0, 0.0, 1.0])
loss = bce(predictions, targets)

KL Divergence

KL Divergence: D_KL(P||Q) = Σ P(x) log(P(x)/Q(x))
def kl_divergence(p, q):
    """KL divergence from P to Q (note: NOT symmetric in p, q)."""
    # Epsilons guard against division by zero and log(0).
    return np.sum(p * np.log(p / (q + 1e-10) + 1e-10))

# Example
p = np.array([0.4, 0.3, 0.3])
q = np.array([0.3, 0.4, 0.3])
kl = kl_divergence(p, q)
print(f"KL(P||Q): {kl:.4f}")

# Swapping the arguments generally gives a different value
kl_reverse = kl_divergence(q, p)
print(f"KL(Q||P): {kl_reverse:.4f}")

# PyTorch KLDivLoss
kl_loss = nn.KLDivLoss(reduction='batchmean')
# The first argument must be log-probabilities
log_q = torch.log(torch.tensor(q))
p_tensor = torch.tensor(p)
loss = kl_loss(log_q, p_tensor)

Perplexity

Perplexity: PP(P) = 2^H(P) = 2^(−Σ P(x) log₂ P(x))
def perplexity(probs):
    """Perplexity = 2 ** entropy: the effective branching factor."""
    return 2 ** entropy(probs)

# Example: language modelling — perplexity measures how "surprised"
# the model is by the data.
vocab_size = 10000

# A uniform model (worst case) is as perplexed as the vocabulary is large
uniform = np.ones(vocab_size) / vocab_size
pp_uniform = perplexity(uniform)
print(f"Perplexity (uniform): {pp_uniform:.0f}")  # 10000

# A concentrated model is far less perplexed
good_model = np.zeros(vocab_size)
good_model[:100] = 0.01  # all mass on 100 words
pp_good = perplexity(good_model)
print(f"Perplexity (good): {pp_good:.0f}")  # ~100
Métrique Formule Interprétation
Entropy -Σ P log P Incertitude moyenne
Cross-Entropy -Σ P log Q Loss classification
KL Divergence Σ P log(P/Q) Distance entre distributions
Perplexity 2^H(P) Équivalent branching factor
💡 Intuition: Cross-Entropy = Entropy + KL Divergence. En minimisant la cross-entropy, on minimise la divergence KL entre prédictions et vraie distribution.

⚡ Optimiseurs

SGD (Stochastic Gradient Descent)

SGD: θ = θ - α∇J(θ)
SGD + Momentum: v = βv + ∇J(θ); θ = θ - αv
import torch.optim as optim # SGD basique optimizer = optim.SGD(model.parameters(), lr=0.01) # SGD avec momentum optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) # SGD avec momentum et weight decay optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4) # Nesterov momentum (accélération) optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

Adam & AdamW

Adam:
m = β₁m + (1-β₁)∇J (moment 1er ordre)
v = β₂v + (1-β₂)∇J² (moment 2e ordre)
θ = θ - α·m̂/(√v̂ + ε)
# Adam (le plus populaire) optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8) # AdamW (meilleur pour transformers) # Découple weight decay du gradient optimizer = optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.01) # Valeurs typiques: # - lr: 1e-3 to 1e-4 # - beta1: 0.9 # - beta2: 0.999 # - weight_decay: 0.01 (AdamW) ou 0 (Adam)

Autres Optimiseurs

# RMSprop (bon pour RNNs) optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-8) # Adagrad (rare aujourd'hui) optimizer = optim.Adagrad(model.parameters(), lr=0.01) # Adadelta optimizer = optim.Adadelta(model.parameters(), lr=1.0) # LAMB (pour très gros batches) # pip install pytorch-lamb from pytorch_lamb import Lamb optimizer = Lamb(model.parameters(), lr=0.001)

Learning Rate Scheduling

from torch.optim.lr_scheduler import * # Linear warmup puis cosine decay def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps): def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / \ float(max(1, num_training_steps - num_warmup_steps)) return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress))) return LambdaLR(optimizer, lr_lambda) # One Cycle Policy (très efficace) scheduler = OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=epochs) # Training loop avec scheduler for epoch in range(epochs): for batch in train_loader: optimizer.zero_grad() loss = model(batch) loss.backward() optimizer.step() scheduler.step() # Appel après chaque batch
Optimiseur LR Typique Cas d'usage
SGD + Momentum 0.01 - 0.1 Vision (ResNet, etc.)
Adam 1e-3 - 1e-4 Usage général
AdamW 1e-3 - 1e-5 Transformers, NLP
RMSprop 1e-3 - 1e-4 RNNs, Reinforcement Learning
🎯 Conseil: Pour transformers, utilisez AdamW avec warmup. Pour CNN, SGD + momentum converge souvent mieux qu'Adam.

🔬 Scikit-learn

Pipeline

from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier # Créer pipeline pipeline = Pipeline([ ('scaler', StandardScaler()), ('pca', PCA(n_components=0.95)), # Garde 95% variance ('classifier', RandomForestClassifier(n_estimators=100)) ]) # Fit & predict pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) # Accéder aux étapes scaler = pipeline.named_steps['scaler'] pca = pipeline.named_steps['pca']

GridSearchCV

from sklearn.model_selection import GridSearchCV # Définir grille de paramètres param_grid = { 'pca__n_components': [10, 20, 50], 'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [5, 10, None], 'classifier__min_samples_split': [2, 5, 10] } # GridSearch grid_search = GridSearchCV( pipeline, param_grid, cv=5, # 5-fold cross-validation scoring='f1_weighted', n_jobs=-1, # Parallélisation verbose=2 ) grid_search.fit(X_train, y_train) # Meilleurs paramètres print(f"Best params: {grid_search.best_params_}") print(f"Best score: {grid_search.best_score_:.4f}") # Meilleur modèle best_model = grid_search.best_estimator_

Train-Test Split

from sklearn.model_selection import train_test_split # Split basique X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) # Stratified split (garde proportions classes) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) # Split en 3: train/val/test X_train, X_temp, y_train, y_temp = train_test_split( X, y, test_size=0.3, random_state=42 ) X_val, X_test, y_val, y_test = train_test_split( X_temp, y_temp, test_size=0.5, random_state=42 ) # 70% train, 15% val, 15% test

Métriques Courantes

from sklearn.metrics import * # Classification accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, average='weighted') recall = recall_score(y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') # Rapport complet print(classification_report(y_true, y_pred)) # Matrice de confusion cm = confusion_matrix(y_true, y_pred) # ROC-AUC roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr') # Régression mse = mean_squared_error(y_true, y_pred) rmse = np.sqrt(mse) mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred)

Preprocessing

from sklearn.preprocessing import * # Standardization (mean=0, std=1) scaler = StandardScaler() X_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # Min-Max scaling (range [0,1]) minmax = MinMaxScaler() X_normalized = minmax.fit_transform(X_train) # Robust scaling (robuste aux outliers) robust = RobustScaler() X_robust = robust.fit_transform(X_train) # Label encoding le = LabelEncoder() y_encoded = le.fit_transform(y) # One-hot encoding ohe = OneHotEncoder(sparse=False) X_onehot = ohe.fit_transform(X_categorical)

📈 Évaluation Modèles

Precision, Recall, F1

Precision: TP / (TP + FP)
Recall: TP / (TP + FN)
F1: 2 × (Precision × Recall) / (Precision + Recall)
from sklearn.metrics import precision_recall_fscore_support # Calcul des métriques precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, average='weighted' ) print(f"Precision: {precision:.4f}") print(f"Recall: {recall:.4f}") print(f"F1-Score: {f1:.4f}") # Par classe precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, average=None ) for i, (p, r, f, s) in enumerate(zip(precision, recall, f1, support)): print(f"Class {i}: P={p:.3f}, R={r:.3f}, F1={f:.3f}, n={s}")

Confusion Matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # Calculer matrice cm = confusion_matrix(y_true, y_pred) # Visualiser disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names) disp.plot(cmap='Blues') plt.title('Confusion Matrix') plt.show() # Métriques depuis confusion matrix # Pour binaire (2x2) TN, FP, FN, TP = cm.ravel() accuracy = (TP + TN) / (TP + TN + FP + FN) precision = TP / (TP + FP) recall = TP / (TP + FN) # = sensitivity specificity = TN / (TN + FP) f1 = 2 * (precision * recall) / (precision + recall)

ROC & AUC

from sklearn.metrics import roc_curve, roc_auc_score, auc # Classification binaire fpr, tpr, thresholds = roc_curve(y_true, y_scores) roc_auc = auc(fpr, tpr) # Plot ROC curve plt.figure(figsize=(8, 6)) plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})') plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc="lower right") plt.show() # Multi-class ROC from sklearn.preprocessing import label_binarize y_bin = label_binarize(y_true, classes=[0, 1, 2]) n_classes = y_bin.shape[1] # Compute ROC per class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_scores[:, i]) roc_auc[i] = auc(fpr[i], tpr[i])

Cross-Validation

from sklearn.model_selection import cross_val_score, cross_validate # Cross-validation simple scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted') print(f"F1 scores: {scores}") print(f"Mean: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})") # Multiple métriques scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted'] scores = cross_validate(model, X, y, cv=5, scoring=scoring) print(f"Precision: {scores['test_precision_weighted'].mean():.4f}") print(f"Recall: {scores['test_recall_weighted'].mean():.4f}") print(f"F1: {scores['test_f1_weighted'].mean():.4f}") # Stratified K-Fold from sklearn.model_selection import StratifiedKFold skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) scores = cross_val_score(model, X, y, cv=skf)
Métrique Quand l'utiliser
Accuracy Classes équilibrées
Precision Minimiser faux positifs (spam, fraude)
Recall Minimiser faux négatifs (maladie)
F1 Balance precision/recall, classes déséquilibrées
AUC-ROC Comparer modèles, indépendant du seuil
⚠️ Classes déséquilibrées: N'utilisez pas accuracy! Préférez F1-score, AUC-ROC, ou precision/recall selon votre cas d'usage.

🧠 Réseaux de Neurones

Perceptron

Perceptron: y = σ(w·x + b)
w = poids, b = biais, σ = fonction d'activation
import numpy as np


class Perceptron:
    """Single-layer perceptron with a Heaviside step activation.

    Trained with the perceptron learning rule (Rosenblatt), which updates
    weights from the prediction *error* — not Hebb's rule, which is purely
    correlation-based (the original docstring mislabelled it).
    """

    def __init__(self, n_inputs):
        # Random weight init; bias starts at zero.
        self.weights = np.random.randn(n_inputs)
        self.bias = 0

    def predict(self, X):
        """Forward pass: step(w·x + b). Works on one sample or a batch."""
        z = np.dot(X, self.weights) + self.bias
        return self.activation(z)

    def activation(self, z):
        """Heaviside step function: 1 if z >= 0, else 0."""
        return np.where(z >= 0, 1, 0)

    def fit(self, X, y, epochs=100, lr=0.01):
        """Train with the perceptron rule: w += lr * (target - pred) * x.

        Converges in finitely many updates when the data is linearly
        separable (perceptron convergence theorem).
        """
        for epoch in range(epochs):
            for xi, target in zip(X, y):
                prediction = self.predict(xi)
                error = target - prediction  # in {-1, 0, 1}
                self.weights += lr * error * xi
                self.bias += lr * error

Fonctions d'Activation

import torch
import torch.nn as nn

# Sigmoid: σ(x) = 1 / (1 + e^(-x))
sigmoid = nn.Sigmoid()
# Output in (0, 1) — used for binary probabilities.

# Tanh: tanh(x) = (e^x - e^(-x)) / (e^x + e^(-x))
tanh = nn.Tanh()
# Output in (-1, 1), zero-centered.

# ReLU: max(0, x)
relu = nn.ReLU()
# Output in [0, ∞) — the most popular hidden activation.

# Leaky ReLU: max(0.01x, x)
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
# Mitigates the "dying ReLU" problem (small gradient for x < 0).

# ELU: x if x > 0, else α(e^x - 1)
elu = nn.ELU(alpha=1.0)

# GELU: x·Φ(x) (Gaussian Error Linear Unit)
gelu = nn.GELU()
# Used in BERT, GPT and most Transformers.

# Softmax: e^xi / Σ e^xj
softmax = nn.Softmax(dim=-1)
# For multi-class classification outputs (rows sum to 1).

Backpropagation

Chain rule: ∂L/∂w = ∂L/∂y · ∂y/∂z · ∂z/∂w
# Manual backpropagation (educational): 2-layer net, ReLU hidden, sigmoid output.
class SimpleNet:
    def __init__(self, input_size, hidden_size, output_size):
        # Small random weights, zero biases.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        """Forward pass; caches intermediates for backward()."""
        self.z1 = X @ self.W1 + self.b1
        self.a1 = np.maximum(0, self.z1)          # ReLU
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = 1 / (1 + np.exp(-self.z2))      # sigmoid
        return self.a2

    def backward(self, X, y, output, lr=0.01):
        """One gradient-descent step, gradients averaged over the batch."""
        inv_m = 1.0 / X.shape[0]

        # Output layer: for a sigmoid output with BCE, dL/dz2 = output - y.
        grad_z2 = output - y
        grad_W2 = inv_m * (self.a1.T @ grad_z2)
        grad_b2 = inv_m * np.sum(grad_z2, axis=0, keepdims=True)

        # Hidden layer: propagate through W2, then gate by the ReLU mask.
        grad_a1 = grad_z2 @ self.W2.T
        grad_z1 = grad_a1 * (self.z1 > 0)
        grad_W1 = inv_m * (X.T @ grad_z1)
        grad_b1 = inv_m * np.sum(grad_z1, axis=0, keepdims=True)

        # Parameter updates.
        self.W1 -= lr * grad_W1
        self.b1 -= lr * grad_b1
        self.W2 -= lr * grad_W2
        self.b2 -= lr * grad_b2

MLP avec PyTorch

class MLP(nn.Module):
    """Fully connected classifier: [Linear → ReLU → Dropout(0.2)]* → Linear."""

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        dims = [input_size] + list(hidden_sizes)
        blocks = []
        # One block per hidden layer.
        for fan_in, fan_out in zip(dims[:-1], dims[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)]
        # Output layer: raw logits, no activation.
        blocks.append(nn.Linear(dims[-1], output_size))
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)


# Build the model
model = MLP(input_size=784, hidden_sizes=[512, 256, 128], output_size=10)
Activation Range Usage
Sigmoid (0, 1) Classification binaire (sortie)
Tanh (-1, 1) Hidden layers (rare maintenant)
ReLU [0, ∞) Hidden layers (standard)
Leaky ReLU (-∞, ∞) Éviter dying ReLU
GELU (-∞, ∞) Transformers
Softmax Σ = 1 Multi-classe (sortie)

🔥 PyTorch Essentials

Tensors

import torch # Créer tensors x = torch.tensor([1, 2, 3]) y = torch.zeros(3, 4) z = torch.ones(2, 3) rand = torch.randn(5, 5) # Distribution normale # Opérations a = torch.tensor([1.0, 2.0, 3.0]) b = torch.tensor([4.0, 5.0, 6.0]) c = a + b # Element-wise d = a * b # Element-wise e = torch.dot(a, b) # Dot product # Matrix operations A = torch.randn(3, 4) B = torch.randn(4, 5) C = A @ B # Matrix multiply (3, 5) C = torch.matmul(A, B) # Équivalent # Reshape x = torch.arange(12) x = x.view(3, 4) # Reshape x = x.view(-1, 2) # Auto-infer: (6, 2) x = x.reshape(3, 4) # Alternatif # Device (CPU/GPU) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') x = x.to(device) y = torch.randn(3, 4, device=device) # Créer direct sur GPU

Autograd

# Autograd: différentiation automatique x = torch.tensor(2.0, requires_grad=True) y = torch.tensor(3.0, requires_grad=True) # Forward pass z = x**2 + y**3 w = z * 2 # Backward pass w.backward() # Gradients print(f"dw/dx = {x.grad}") # 8.0 (= 2 * 2x) print(f"dw/dy = {y.grad}") # 54.0 (= 2 * 3y²) # Réinitialiser gradients x.grad.zero_() y.grad.zero_() # No grad (pour inference) with torch.no_grad(): predictions = model(x_test) # Ou @torch.no_grad() def evaluate(model, data): return model(data)

nn.Module

import torch.nn as nn


class MyModel(nn.Module):
    """Two-layer MLP: Linear → ReLU → Dropout(0.2) → Linear (returns logits)."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layer definitions
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Forward pass through both layers."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)


# Instantiate the model
model = MyModel(input_dim=784, hidden_dim=256, output_dim=10)

# Move it to the GPU (`device` is defined earlier in the file)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

DataLoader

from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    """Minimal map-style dataset wrapping paired data/label containers."""

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # Number of samples.
        return len(self.data)

    def __getitem__(self, idx):
        # One (sample, label) pair.
        return self.data[idx], self.labels[idx]


# Build the dataset
dataset = CustomDataset(X_train, y_train)

# Wrap it in a DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,    # parallel worker processes
    pin_memory=True   # faster host→GPU transfers
)

# Iterate over mini-batches
for batch_x, batch_y in dataloader:
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    # Training...

Training Loop

import torch.optim as optim # Setup model = MyModel(784, 256, 10).to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=0.001) # Training loop num_epochs = 10 for epoch in range(num_epochs): model.train() # Mode training train_loss = 0 for batch_x, batch_y in train_loader: batch_x = batch_x.to(device) batch_y = batch_y.to(device) # Forward pass outputs = model(batch_x) loss = criterion(outputs, batch_y) # Backward pass optimizer.zero_grad() loss.backward() optimizer.step() train_loss += loss.item() # Validation model.eval() # Mode evaluation val_loss = 0 correct = 0 total = 0 with torch.no_grad(): for batch_x, batch_y in val_loader: batch_x = batch_x.to(device) batch_y = batch_y.to(device) outputs = model(batch_x) loss = criterion(outputs, batch_y) val_loss += loss.item() _, predicted = outputs.max(1) total += batch_y.size(0) correct += predicted.eq(batch_y).sum().item() accuracy = 100. * correct / total print(f"Epoch {epoch+1}: " f"Train Loss: {train_loss/len(train_loader):.4f}, " f"Val Loss: {val_loss/len(val_loader):.4f}, " f"Val Acc: {accuracy:.2f}%")
⚡ Performance: Utilisez pin_memory=True dans DataLoader et .to(device, non_blocking=True) pour transferts GPU plus rapides.

📉 Loss Functions

Mean Squared Error (MSE)

MSE: L = (1/n) Σ (yᵢ - ŷᵢ)²
import torch.nn as nn # MSE Loss (régression) mse_loss = nn.MSELoss() predictions = torch.tensor([2.5, 3.7, 1.2]) targets = torch.tensor([3.0, 4.0, 1.0]) loss = mse_loss(predictions, targets) # MAE Loss (plus robuste aux outliers) mae_loss = nn.L1Loss() loss = mae_loss(predictions, targets) # Huber Loss (combine MSE et MAE) huber_loss = nn.SmoothL1Loss() loss = huber_loss(predictions, targets)

Cross-Entropy Loss

CE: L = -Σ yᵢ log(ŷᵢ)
Pour classification: L = -log(ŷ_{classe vraie})
# CrossEntropyLoss (classification multi-classe) # Combine LogSoftmax + NLLLoss ce_loss = nn.CrossEntropyLoss() # Input: logits (avant softmax), shape (batch, num_classes) logits = torch.tensor([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]]) # Target: indices de classe, shape (batch,) targets = torch.tensor([0, 1]) loss = ce_loss(logits, targets) # Avec poids de classe (pour déséquilibre) class_weights = torch.tensor([1.0, 2.0, 1.5]) weighted_ce = nn.CrossEntropyLoss(weight=class_weights) # Label smoothing (régularisation) ce_smoothed = nn.CrossEntropyLoss(label_smoothing=0.1)

Binary Cross-Entropy (BCE)

BCE: L = -[y log(ŷ) + (1-y) log(1-ŷ)]
# BCELoss (classification binaire) # Input doit être entre 0 et 1 (après sigmoid) bce_loss = nn.BCELoss() predictions = torch.tensor([0.8, 0.3, 0.9]) targets = torch.tensor([1.0, 0.0, 1.0]) loss = bce_loss(predictions, targets) # BCEWithLogitsLoss (plus stable numériquement) # Input: logits (avant sigmoid) bce_logits = nn.BCEWithLogitsLoss() logits = torch.tensor([1.5, -0.8, 2.1]) targets = torch.tensor([1.0, 0.0, 1.0]) loss = bce_logits(logits, targets) # Multi-label classification # (plusieurs classes peuvent être vraies) predictions = torch.tensor([[0.8, 0.3, 0.9], [0.2, 0.7, 0.4]]) targets = torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]) loss = bce_loss(predictions, targets)

Autres Loss Functions

# NLLLoss (Negative Log Likelihood) # Input doit être log-probabilities nll_loss = nn.NLLLoss() log_probs = torch.log_softmax(logits, dim=1) loss = nll_loss(log_probs, targets) # KLDivLoss (Divergence KL) kl_loss = nn.KLDivLoss(reduction='batchmean') # Input: log-probabilities, Target: probabilities loss = kl_loss(log_probs, target_probs) # CosineEmbeddingLoss (pour embeddings) cosine_loss = nn.CosineEmbeddingLoss() embedding1 = torch.randn(32, 128) embedding2 = torch.randn(32, 128) target = torch.ones(32) # 1 si similaire, -1 si différent loss = cosine_loss(embedding1, embedding2, target) # TripletMarginLoss (pour metric learning) triplet_loss = nn.TripletMarginLoss(margin=1.0) anchor = torch.randn(32, 128) positive = torch.randn(32, 128) # Même classe negative = torch.randn(32, 128) # Classe différente loss = triplet_loss(anchor, positive, negative)
Loss Task Input Target
MSE Régression Valeurs réelles Valeurs réelles
CrossEntropy Multi-classe Logits Indices classe
BCE Binaire Probabilities [0,1] 0 ou 1
BCEWithLogits Binaire Logits 0 ou 1
NLL Multi-classe Log-probabilities Indices classe
💡 Astuce: Utilisez toujours BCEWithLogitsLoss au lieu de Sigmoid + BCELoss, et CrossEntropyLoss au lieu de Softmax + NLLLoss. Plus stable numériquement!

🖼️ CNN Architecture

Conv2d Parameters

Output size: O = ⌊(W - K + 2P) / S⌋ + 1
W=input, K=kernel, P=padding, S=stride
import torch.nn as nn # Convolution 2D conv = nn.Conv2d( in_channels=3, # RGB out_channels=64, # Nombre de filtres kernel_size=3, # 3x3 stride=1, padding=1, # 'same' padding bias=True ) # Exemple calcul taille sortie # Input: (batch, 3, 32, 32) # Output: (batch, 64, 32, 32) # Car: (32 - 3 + 2*1) / 1 + 1 = 32 # Depthwise Separable Convolution (MobileNet) depthwise = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64) pointwise = nn.Conv2d(64, 128, kernel_size=1) # Dilated/Atrous Convolution dilated = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)

Pooling Layers

# Max Pooling (le plus commun) maxpool = nn.MaxPool2d( kernel_size=2, # 2x2 stride=2 # Réduit taille par 2 ) # Input: (batch, 64, 32, 32) # Output: (batch, 64, 16, 16) # Average Pooling avgpool = nn.AvgPool2d(kernel_size=2, stride=2) # Adaptive Average Pooling (taille output fixe) adaptive = nn.AdaptiveAvgPool2d((7, 7)) # Input: (batch, 512, 14, 14) # Output: (batch, 512, 7, 7) # Global Average Pooling global_pool = nn.AdaptiveAvgPool2d((1, 1)) # Input: (batch, 512, 7, 7) # Output: (batch, 512, 1, 1)

CNN Simple

class SimpleCNN(nn.Module):
    """Three conv blocks (Conv → BN → ReLU → MaxPool) + GAP classifier head."""

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_block(c_in, c_out):
            # Standard block; MaxPool halves the spatial resolution.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2, 2),
            ]

        # Channels double each time the resolution is halved.
        self.features = nn.Sequential(
            *conv_block(3, 64),
            *conv_block(64, 128),
            *conv_block(128, 256),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # fixed size regardless of input
            nn.Flatten(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        return self.classifier(self.features(x))

ResNet Block

class ResidualBlock(nn.Module):
    """Basic ResNet block: two 3x3 convs plus an identity/projection shortcut."""

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # 1x1 projection shortcut only when the shape changes; otherwise
        # the shortcut is an identity (empty Sequential).
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + residual  # skip connection
        return self.relu(out)
Layer Paramètres Usage
Conv2d in_ch, out_ch, kernel, stride, padding Extraction features
MaxPool2d kernel, stride Downsampling
BatchNorm2d num_features Normalisation, stabilité
Dropout2d p (prob drop) Régularisation
AdaptiveAvgPool2d output_size Pooling taille fixe
🎨 Architecture CNN: Pattern typique: Conv → BatchNorm → ReLU → Pool. Doubler les canaux quand on divise la résolution par 2.

🔄 RNN/LSTM/GRU

RNN Simple

RNN: h_t = tanh(W_hh·h_{t-1} + W_xh·x_t + b)
import torch.nn as nn # RNN basique rnn = nn.RNN( input_size=100, # Dimension input hidden_size=256, # Dimension hidden state num_layers=2, # Nombre couches empilées batch_first=True, # Input shape: (batch, seq, features) dropout=0.2, # Dropout entre couches bidirectional=False ) # Forward pass x = torch.randn(32, 10, 100) # (batch, seq_len, input_size) h0 = torch.zeros(2, 32, 256) # (num_layers, batch, hidden_size) output, hn = rnn(x, h0) # output: (32, 10, 256) - output à chaque timestep # hn: (2, 32, 256) - dernier hidden state # RNN Cell (manuel) rnn_cell = nn.RNNCell(input_size=100, hidden_size=256) h = torch.zeros(32, 256) outputs = [] for t in range(10): h = rnn_cell(x[:, t, :], h) outputs.append(h)

LSTM

LSTM gates:
f_t = σ(W_f·[h_{t-1}, x_t]) (forget)
i_t = σ(W_i·[h_{t-1}, x_t]) (input)
o_t = σ(W_o·[h_{t-1}, x_t]) (output)
# LSTM: the standard choice for sequence modelling.
lstm = nn.LSTM(
    input_size=100,
    hidden_size=256,
    num_layers=2,
    batch_first=True,
    dropout=0.2,
    bidirectional=True  # Bi-LSTM
)

# Forward pass
x = torch.randn(32, 10, 100)
h0 = torch.zeros(2*2, 32, 256)  # num_layers * 2 directions
c0 = torch.zeros(2*2, 32, 256)  # cell state
output, (hn, cn) = lstm(x, (h0, c0))
# output: (32, 10, 512) — 512 because bidirectional (256 per direction)
# hn: (4, 32, 256) — final hidden states
# cn: (4, 32, 256) — final cell states


# Example: sequence classification
class LSTMClassifier(nn.Module):
    """Token ids → embedding → Bi-LSTM → linear head (logits)."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token ids
        embedded = self.embedding(x)            # (batch, seq_len, embed_dim)
        output, (hn, cn) = self.lstm(embedded)
        # hn[-2] is the last forward state, hn[-1] the last backward state:
        # concatenate them for the classifier.
        hidden = torch.cat([hn[-2], hn[-1]], dim=1)
        return self.fc(hidden)

GRU

GRU: Plus simple que LSTM, souvent aussi bon
r_t = σ(W_r·[h_{t-1}, x_t]) (reset)
z_t = σ(W_z·[h_{t-1}, x_t]) (update)
# GRU (plus rapide que LSTM) gru = nn.GRU( input_size=100, hidden_size=256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True ) # Forward pass x = torch.randn(32, 10, 100) h0 = torch.zeros(2*2, 32, 256) output, hn = gru(x, h0) # Pas de cell state (plus simple que LSTM)

Quand utiliser chaque type

Type Avantages Quand utiliser
RNN Simple, rapide Séquences courtes, baseline
LSTM Gère dépendances longues, stable Séquences longues, standard NLP
GRU Plus rapide que LSTM, moins de paramètres Alternative à LSTM, contraintes compute
Bi-LSTM/GRU Context bidirectionnel Classification, pas génération

Handling Sequences

# Padding sequences (différentes longueurs) from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence # Sequences de longueurs variables seqs = [torch.randn(5, 100), torch.randn(8, 100), torch.randn(3, 100)] # Padding padded = pad_sequence(seqs, batch_first=True) # Shape: (3, 8, 100) - paddé à max length # Pack (efficace pour training) lengths = torch.tensor([5, 8, 3]) packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False) # Forward avec packed output, hn = lstm(packed) # Unpack output, lengths = pad_packed_sequence(output, batch_first=True)
⚠️ Vanishing Gradient: RNN basiques souffrent de vanishing gradient sur longues séquences. LSTM/GRU résolvent ce problème. Pour séquences très longues (>500), considérez Transformer.

🤖 Transformer Basics

Self-Attention

Attention: Attention(Q,K,V) = softmax(Q·Kᵀ/√d_k)·V
Q = queries, K = keys, V = values, d_k = dimension des keys
import torch
import torch.nn as nn
import math


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention."""

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        # Learned projections for queries, keys and values.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.scale = math.sqrt(embed_dim)

    def forward(self, x, mask=None):
        """x: (batch, seq_len, embed_dim) → (output, attention_weights)."""
        batch_size, seq_len, _ = x.shape

        q = self.query(x)   # (batch, seq_len, embed_dim)
        k = self.key(x)
        v = self.value(x)

        # Similarity of every position with every other, scaled by √d:
        # (batch, seq_len, seq_len).
        scores = torch.bmm(q, k.transpose(1, 2)) / self.scale

        # Optional mask (padding or causal): blocked positions → -1e9
        # so softmax sends them to ~0.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the values.
        return torch.bmm(attn_weights, v), attn_weights

Multi-Head Attention

Multi-Head: MultiHead(Q,K,V) = Concat(head_1, ..., head_h)·W^O
où head_i = Attention(Q·W_i^Q, K·W_i^K, V·W_i^V)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with a fused QKV projection."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # One linear layer produces Q, K and V for every head at once.
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.out = nn.Linear(embed_dim, embed_dim)
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # (batch, seq, 3*E) → (3, batch, heads, seq, head_dim)
        qkv = self.qkv(x)
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        Q, K, V = qkv[0], qkv[1], qkv[2]

        # Per-head scaled attention scores.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)

        # Weighted values, then merge the heads back together.
        context = torch.matmul(attn, V)  # (batch, heads, seq, head_dim)
        context = context.transpose(1, 2)  # (batch, seq, heads, head_dim)
        context = context.reshape(batch_size, seq_len, embed_dim)
        return self.out(context)


# PyTorch built-in equivalent
mha = nn.MultiheadAttention(
    embed_dim=512,
    num_heads=8,
    dropout=0.1,
    batch_first=True
)
output, attn_weights = mha(x, x, x)  # self-attention (`x` defined earlier in the file)

Positional Encoding

Position:
PE(pos, 2i) = sin(pos / 10000^(2i/d))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding (Vaswani et al., 2017)."""

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        # Precompute the (max_len, embed_dim) table once.
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float()
                             * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)  # even dims → sin
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dims → cos
        # Buffer: follows .to(device) and state_dict, but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for positions 0..seq_len-1 to x."""
        return x + self.pe[:, :x.size(1), :]


# Alternative: learned positional embeddings
class LearnedPositionalEmbedding(nn.Module):
    """Trainable alternative: one embedding vector per position index."""

    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(max_len, embed_dim)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        return x + self.embedding(positions)

Transformer Block

class TransformerBlock(nn.Module):
    """Post-LN encoder block: self-attention and FFN, each with residual + norm."""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Multi-head self-attention.
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )
        # Position-wise feed-forward: expand → GELU → project back.
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Sub-layer 1: attention, residual, LayerNorm.
        attn_output, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Sub-layer 2: feed-forward, residual, LayerNorm.
        return self.norm2(x + self.ff(x))


# Full encoder-only Transformer classifier
class Transformer(nn.Module):
    """Embed → positions → N encoder blocks → mean-pool → linear head."""

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers,
                 ff_dim, max_len, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token ids
        h = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            h = block(h)
        # Mean-pool over the sequence (alternative: a [CLS] token).
        return self.classifier(h.mean(dim=1))
Composant Rôle
Self-Attention Relations entre tokens
Multi-Head Différents aspects d'attention
Positional Encoding Information de position
Feed-Forward Transformation non-linéaire
Layer Norm Stabilisation training
Residual Connections Gradient flow, deep networks
🚀 Transformers: Remplacent RNN/LSTM dans la plupart des tâches NLP. Parallélisables (vs RNN séquentiel) et capturent mieux les dépendances longues. Base de BERT, GPT, etc.