📚 Cheatsheet Phase 1

Fondations IA & Environnement

🔧 Setup Environnement IA

Créer un environnement Conda

# Créer environnement avec Python 3.10 conda create -n ia-env python=3.10 conda activate ia-env # Installer PyTorch avec CUDA 11.8 conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia # Ou avec pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Vérifier CUDA

import torch # Vérifier disponibilité CUDA print(f"CUDA disponible: {torch.cuda.is_available()}") print(f"Version CUDA: {torch.version.cuda}") print(f"Nombre de GPUs: {torch.cuda.device_count()}") print(f"GPU actuel: {torch.cuda.get_device_name(0)}") # Tester calcul GPU x = torch.rand(1000, 1000).cuda() y = torch.rand(1000, 1000).cuda() z = x @ y # Multiplication sur GPU

WSL2 pour Windows

# Installation WSL2 wsl --install wsl --set-default-version 2 # Installer Ubuntu wsl --install -d Ubuntu-22.04 # Dans WSL, installer conda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh bash Miniconda3-latest-Linux-x86_64.sh # Installer CUDA Toolkit dans WSL wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
💡 Astuce: Utilisez conda env export > environment.yml pour sauvegarder votre environnement et le partager.

🐍 Python Essentials

Decorators

import time
from functools import wraps


def timer(func):
    """Decorator that prints how long the wrapped call took."""
    @wraps(func)  # keep the wrapped function's name/docstring
    def timed(*args, **kwargs):
        t0 = time.time()
        out = func(*args, **kwargs)
        print(f"{func.__name__} took {time.time()-t0:.4f}s")
        return out
    return timed


@timer
def train_model(epochs):
    # Simulated training work
    time.sleep(2)


# Property decorator: controlled attribute access without getters/setters
class Model:
    """Tiny example of exposing a private field through a property."""

    def __init__(self):
        self._weights = None

    @property
    def weights(self):
        return self._weights

    @weights.setter
    def weights(self, new_weights):
        self._weights = new_weights

Generators

def data_generator(batch_size, num_batches):
    """Lazily yield `num_batches` random (features, labels) batches."""
    for _ in range(num_batches):
        # MNIST-sized fake features and integer class labels in [0, 10)
        inputs = torch.randn(batch_size, 784)
        targets = torch.randint(0, 10, (batch_size,))
        yield inputs, targets


# Usage: one batch at a time, nothing materialised up front
for batch, labels in data_generator(32, 100):
    pass  # process the batch here

# Generator expression: values are produced on demand, saving memory
squares = (x**2 for x in range(1000000))

Type Hints

from typing import List, Dict, Tuple, Optional, Union import numpy as np def process_data( data: np.ndarray, labels: Optional[np.ndarray] = None, batch_size: int = 32 ) -> Tuple[np.ndarray, Dict[str, float]]: """Traite les données et retourne résultats + métriques""" processed = data * 2.0 metrics = {"mean": float(data.mean()), "std": float(data.std())} return processed, metrics # Type pour modèle from torch import nn ModelType = nn.Module OptimizerType = Union[torch.optim.Adam, torch.optim.SGD]

Comprehensions

# List comprehension squares = [x**2 for x in range(10) if x % 2 == 0] # Dict comprehension metrics = {f"layer_{i}": torch.randn(10) for i in range(5)} # Set comprehension unique_labels = {label for batch in dataset for label in batch} # Nested comprehension matrix = [[i*j for j in range(5)] for i in range(5)]

🔢 NumPy Operations

Array Creation

import numpy as np # Différentes façons de créer arrays a = np.array([1, 2, 3, 4, 5]) b = np.zeros((3, 4)) c = np.ones((2, 3, 4)) d = np.arange(0, 10, 0.5) # 0 à 10 par pas de 0.5 e = np.linspace(0, 1, 100) # 100 valeurs entre 0 et 1 f = np.random.randn(5, 5) # Distribution normale g = np.eye(4) # Matrice identité 4x4 h = np.full((3, 3), 7) # Matrice remplie de 7

Reshaping

# Reshape x = np.arange(12) x_reshaped = x.reshape(3, 4) # 3x4 x_auto = x.reshape(-1, 2) # Calcul auto: 6x2 # Transpose x_t = x_reshaped.T x_swap = x.reshape(2, 3, 2).swapaxes(0, 1) # Flatten x_flat = x_reshaped.flatten() # Copie x_ravel = x_reshaped.ravel() # Vue si possible # Add dimension x_expanded = x[np.newaxis, :] # (1, 12) x_expanded2 = x[:, np.newaxis] # (12, 1)

Broadcasting

# Broadcasting examples a = np.array([[1, 2, 3], [4, 5, 6]]) # (2, 3) b = np.array([10, 20, 30]) # (3,) c = a + b # Broadcast b sur chaque ligne # Normalisation avec broadcasting X = np.random.randn(100, 10) mean = X.mean(axis=0, keepdims=True) # (1, 10) std = X.std(axis=0, keepdims=True) # (1, 10) X_normalized = (X - mean) / std # Broadcasting pour distance matrices points = np.random.randn(50, 3) # 50 points en 3D distances = np.sqrt(((points[:, np.newaxis] - points) ** 2).sum(axis=2))

Einsum

# Einsum: notation Einstein pour opérations tensorielles # Dot product a = np.random.randn(5) b = np.random.randn(5) dot = np.einsum('i,i->', a, b) # Équivalent à np.dot(a, b) # Matrix multiplication A = np.random.randn(3, 4) B = np.random.randn(4, 5) C = np.einsum('ij,jk->ik', A, B) # A @ B # Batch matrix multiplication batch_A = np.random.randn(10, 3, 4) batch_B = np.random.randn(10, 4, 5) batch_C = np.einsum('bij,bjk->bik', batch_A, batch_B) # Trace d'une matrice trace = np.einsum('ii->', A) # Outer product outer = np.einsum('i,j->ij', a, b)
⚡ Performance: Utilisez toujours des opérations vectorisées plutôt que des boucles Python. NumPy est jusqu'à 100x plus rapide!

🐼 Pandas Essentials

Read & Write CSV

import pandas as pd # Lecture CSV df = pd.read_csv('data.csv') df = pd.read_csv('data.csv', sep=';', encoding='utf-8', parse_dates=['date'], index_col=0) # Lecture avec chunks pour gros fichiers for chunk in pd.read_csv('huge_file.csv', chunksize=10000): process(chunk) # Écriture CSV df.to_csv('output.csv', index=False) df.to_csv('output.csv', sep='\t', encoding='utf-8')

GroupBy

# GroupBy basique grouped = df.groupby('category')['value'].mean() # Multiple aggregations agg_result = df.groupby('category').agg({ 'value': ['mean', 'std', 'count'], 'price': ['min', 'max', 'median'] }) # Custom aggregation def custom_metric(x): return (x.max() - x.min()) / x.mean() df.groupby('category')['value'].apply(custom_metric) # Multiple groupby keys df.groupby(['category', 'subcategory']).mean() # Transform (garde shape originale) df['normalized'] = df.groupby('category')['value'].transform( lambda x: (x - x.mean()) / x.std() )

Merge & Join

# Merge (comme SQL JOIN) result = pd.merge(df1, df2, on='key', how='inner') # inner, left, right, outer # Merge sur multiple colonnes result = pd.merge(df1, df2, on=['key1', 'key2']) # Merge avec suffixes result = pd.merge(df1, df2, on='key', suffixes=('_left', '_right')) # Join sur index result = df1.join(df2, how='left') # Concat (empile dataframes) result = pd.concat([df1, df2], axis=0) # Vertical result = pd.concat([df1, df2], axis=1) # Horizontal

Pivot Table

# Pivot table pivot = df.pivot_table( values='sales', index='date', columns='category', aggfunc='sum', fill_value=0 ) # Multiple aggregations pivot_multi = df.pivot_table( values='sales', index='date', columns='category', aggfunc=['sum', 'mean', 'count'] ) # Pivot simple (pas d'aggregation) pivoted = df.pivot(index='date', columns='category', values='sales')

Gestion valeurs manquantes

# Détecter NaN df.isna().sum() # Nombre par colonne df.isnull().any() # Colonnes avec NaN # Remplir NaN df.fillna(0) # Avec constante df.fillna(method='ffill') # Forward fill df.fillna(method='bfill') # Backward fill df.fillna(df.mean()) # Avec moyenne # Remplir par colonne df['age'].fillna(df['age'].median(), inplace=True) # Supprimer NaN df.dropna() # Supprime lignes avec NaN df.dropna(axis=1) # Supprime colonnes avec NaN df.dropna(thresh=2) # Garde lignes avec au moins 2 non-NaN
🚀 Optimisation: Utilisez df.query() au lieu de df[df['col'] > 5] pour de meilleures performances sur gros datasets.

📊 Matplotlib & Plotly

Matplotlib Subplots

import matplotlib.pyplot as plt import numpy as np # Subplots basiques fig, axes = plt.subplots(2, 2, figsize=(12, 8)) axes[0, 0].plot(x, y) axes[0, 0].set_title('Line Plot') axes[0, 1].scatter(x, y, alpha=0.5) axes[0, 1].set_title('Scatter') axes[1, 0].hist(data, bins=30, edgecolor='black') axes[1, 0].set_title('Histogram') axes[1, 1].bar(categories, values) axes[1, 1].set_title('Bar Chart') plt.tight_layout() plt.savefig('plots.png', dpi=300, bbox_inches='tight') plt.show()

Heatmap & Confusion Matrix

import seaborn as sns from sklearn.metrics import confusion_matrix # Heatmap de corrélation correlation = df.corr() plt.figure(figsize=(10, 8)) sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, square=True, linewidths=1) plt.title('Matrice de Corrélation') plt.show() # Confusion Matrix y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 2, 1, 0, 1, 1] cm = confusion_matrix(y_true, y_pred) plt.figure(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') plt.ylabel('True Label') plt.xlabel('Predicted Label') plt.title('Confusion Matrix') plt.show()

Plotly Interactive

import plotly.graph_objects as go import plotly.express as px # Line plot interactif fig = go.Figure() fig.add_trace(go.Scatter(x=x, y=y1, mode='lines', name='Train')) fig.add_trace(go.Scatter(x=x, y=y2, mode='lines', name='Val')) fig.update_layout( title='Training History', xaxis_title='Epoch', yaxis_title='Loss', hovermode='x unified' ) fig.show() # Plotly Express (plus simple) df = px.data.iris() fig = px.scatter(df, x='sepal_width', y='sepal_length', color='species', size='petal_length', hover_data=['petal_width']) fig.show() # 3D Scatter fig = go.Figure(data=[go.Scatter3d( x=X[:, 0], y=X[:, 1], z=X[:, 2], mode='markers', marker=dict(size=5, color=labels, colorscale='Viridis') )]) fig.show()

Visualisation Métriques ML

from sklearn.metrics import roc_curve, auc # ROC Curve fpr, tpr, thresholds = roc_curve(y_true, y_scores) roc_auc = auc(fpr, tpr) plt.figure(figsize=(8, 6)) plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})') plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc="lower right") plt.show()

📓 Jupyter Tips

Magic Commands

# Mesurer temps d'exécution %timeit sum(range(1000)) %%timeit # Mesure tout le cell result = expensive_computation() # Matplotlib inline %matplotlib inline # Ou interactif %matplotlib notebook # Recharger modules automatiquement %load_ext autoreload %autoreload 2 # Afficher variables %who # Liste variables %whos # Détails variables # Exécuter script Python %run script.py # Historique commandes %history -n 1-10 # Debugging %pdb # Active debugger sur erreur

Cell Magics

# %%time - Temps d'exécution cell %%time df = pd.read_csv('large_file.csv') result = df.groupby('category').mean() # %%writefile - Écrire dans fichier %%writefile utils.py def helper_function(): return "Hello from file" # %%bash - Commandes bash %%bash ls -la pwd echo "Current directory" # %%html - HTML custom %%html <div style="color: red;">Important!</div> # %%javascript - JS dans notebook %%javascript alert('Hello from JS');

Extensions Utiles

# Installer extensions pip install jupyter_contrib_nbextensions jupyter contrib nbextension install --user # Extensions recommandées: # - Table of Contents: navigation facile # - Variable Inspector: voir variables en temps réel # - ExecuteTime: temps d'exécution cells # - Collapsible Headings: replier sections # - Code prettify: formatter code automatiquement # JupyterLab extensions jupyter labextension install @jupyterlab/toc jupyter labextension install @jupyter-widgets/jupyterlab-manager

Keyboard Shortcuts

Raccourci Action
Shift + Enter Exécuter cell et passer à suivante
Ctrl + Enter Exécuter cell
A Insérer cell au dessus
B Insérer cell en dessous
DD Supprimer cell
M Convertir en Markdown
Y Convertir en Code
Shift + M Fusionner cells
💡 Pro Tip: Utilisez ?function_name pour afficher la docstring, et ??function_name pour voir le code source!

🔀 Git pour ML

Git LFS (Large File Storage)

# Installer Git LFS git lfs install # Tracker fichiers volumineux git lfs track "*.pth" git lfs track "*.h5" git lfs track "*.pkl" git lfs track "data/*.csv" # Vérifier fichiers trackés git lfs ls-files # Cloner repo avec LFS git lfs clone https://github.com/user/repo.git # Pull fichiers LFS git lfs pull

.gitignore pour ML

# Python __pycache__/ *.py[cod] *$py.class .Python env/ venv/ # Jupyter .ipynb_checkpoints *.ipynb_checkpoints/ # Data data/raw/* data/processed/* !data/.gitkeep # Models models/*.pth models/*.h5 models/*.pkl !models/.gitkeep # Logs & Experiments logs/ runs/ mlruns/ wandb/ # Environment .env .env.local credentials.json # IDE .vscode/ .idea/ *.swp # OS .DS_Store Thumbs.db

DVC (Data Version Control)

# Installer DVC pip install dvc # Initialiser DVC dvc init # Ajouter remote storage (S3, GCS, Azure...) dvc remote add -d storage s3://mybucket/dvc-storage # Tracker données avec DVC dvc add data/train.csv git add data/train.csv.dvc data/.gitignore git commit -m "Add training data" # Pousser données vers remote dvc push # Pull données dvc pull # Créer pipeline ML dvc run -n preprocess \ -d src/preprocess.py -d data/raw \ -o data/processed \ python src/preprocess.py # Visualiser pipeline dvc dag

Commits & Branches ML

# Branches pour expérimentations git checkout -b experiment/transformer-v2 git checkout -b feature/data-augmentation # Commit avec métriques git commit -m "Train ResNet50 - Acc: 94.5% - Loss: 0.23" # Tags pour versions modèles git tag -a v1.0 -m "Model v1.0 - Production ready" git push origin v1.0 # Stash pour sauvegarder WIP git stash save "WIP: testing new architecture" git stash list git stash pop # Rebase interactif pour nettoyer historique git rebase -i HEAD~5
🎯 Best Practice: Ne jamais commiter les poids des modèles dans Git. Utilisez Git LFS ou DVC pour les fichiers > 100MB.

🧮 Algèbre Linéaire

Dot Product & Matrix Multiply

Dot Product: a · b = Σ aᵢbᵢ = |a||b|cos(θ)
# Dot product a = np.array([1, 2, 3]) b = np.array([4, 5, 6]) dot = np.dot(a, b) # 32 # Matrix multiplication A = np.random.randn(3, 4) B = np.random.randn(4, 5) C = A @ B # (3, 5) C = np.matmul(A, B) # Équivalent # Batch matrix multiplication batch_A = np.random.randn(10, 3, 4) batch_B = np.random.randn(10, 4, 5) batch_C = batch_A @ batch_B # (10, 3, 5)

Eigenvalues & Eigenvectors

Équation propre: Av = λv
# Valeurs et vecteurs propres A = np.random.randn(5, 5) A = A @ A.T # Matrice symétrique eigenvalues, eigenvectors = np.linalg.eig(A) # Vérification v = eigenvectors[:, 0] lambda_v = eigenvalues[0] assert np.allclose(A @ v, lambda_v * v) # Diagonalisation: A = VΛV⁻¹ Lambda = np.diag(eigenvalues) V = eigenvectors A_reconstructed = V @ Lambda @ np.linalg.inv(V)

SVD (Singular Value Decomposition)

SVD: A = UΣVᵀ
# SVD A = np.random.randn(100, 50) U, s, Vt = np.linalg.svd(A, full_matrices=False) # U: (100, 50), s: (50,), Vt: (50, 50) # Reconstruction Sigma = np.diag(s) A_reconstructed = U @ Sigma @ Vt # Compression avec SVD (garder k composantes) k = 10 A_compressed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :] # Variance expliquée variance_explained = np.cumsum(s**2) / np.sum(s**2)

Normes & Distances

# Normes de vecteurs v = np.array([3, 4]) # L1 norm (Manhattan) l1 = np.linalg.norm(v, ord=1) # 7 # L2 norm (Euclidean) l2 = np.linalg.norm(v, ord=2) # 5 # L-inf norm (maximum) linf = np.linalg.norm(v, ord=np.inf) # 4 # Distance entre vecteurs a = np.array([1, 2, 3]) b = np.array([4, 5, 6]) euclidean_dist = np.linalg.norm(a - b) # Cosine similarity cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
Opération NumPy Dimension
Dot product np.dot(a, b) (n,) × (n,) → scalaire
Matrix multiply A @ B (m,n) × (n,p) → (m,p)
Element-wise A * B (m,n) × (m,n) → (m,n)
Outer product np.outer(a, b) (m,) × (n,) → (m,n)

📐 Dérivées & Gradient

Chain Rule

Règle de la chaîne: d/dx[f(g(x))] = f'(g(x)) · g'(x)
# Example: y = (x² + 1)³
# f(u) = u³, g(x) = x² + 1
# dy/dx = 3u² · 2x = 3(x² + 1)² · 2x
import torch

x = torch.tensor(2.0, requires_grad=True)
y = (x**2 + 1)**3
y.backward()
# At x = 2: 3·(2²+1)²·2·2 = 3·25·4 = 300 (not 180 as previously stated)
print(f"dy/dx = {x.grad}")  # 300.0

# Multi-variable chain rule
x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)
z = x**2 + y**2
w = torch.exp(z)
w.backward()
print(f"dw/dx = {x.grad}")  # 2x·exp(x²+y²)
print(f"dw/dy = {y.grad}")  # 2y·exp(x²+y²)

Gradient Descent

Mise à jour: θ = θ - α∇J(θ)
α = learning rate, ∇J = gradient de la loss
# Plain batch gradient descent
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit linear parameters theta on (X, y) by minimising MSE.

    Returns the learned parameter vector theta of shape (n_features,).
    """
    m, n = X.shape
    theta = np.zeros(n)
    for epoch in range(epochs):
        # Forward pass and residual
        predictions = X @ theta
        residual = predictions - y
        # Gradient of the (1/2m)·MSE loss
        gradient = (1/m) * X.T @ residual
        # Parameter update
        theta = theta - learning_rate * gradient
        # Track the loss
        loss = (1/(2*m)) * np.sum(residual**2)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Loss = {loss:.4f}")
    return theta


# Same loop expressed with PyTorch
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(100):
    predictions = model(X)
    loss = criterion(predictions, y)
    optimizer.zero_grad()  # Reset gradients
    loss.backward()        # Compute gradients
    optimizer.step()       # Update parameters

Learning Rate

# Learning rate trop grand: divergence # Learning rate trop petit: convergence lente # Stratégies learning rate from torch.optim.lr_scheduler import * # Step decay scheduler = StepLR(optimizer, step_size=10, gamma=0.1) # Exponential decay scheduler = ExponentialLR(optimizer, gamma=0.95) # Cosine annealing scheduler = CosineAnnealingLR(optimizer, T_max=100) # Reduce on plateau scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10) # Utilisation for epoch in range(100): train(...) val_loss = validate(...) scheduler.step(val_loss) # ou scheduler.step() selon type

Dérivées Courantes

Fonction Dérivée
xⁿ nxⁿ⁻¹
eˣ eˣ
ln(x) 1/x
sin(x) cos(x)
cos(x) -sin(x)
σ(x) = 1/(1+e⁻ˣ) (sigmoid) σ(x)(1-σ(x))
tanh(x) 1 - tanh²(x)
⚠️ Attention: Toujours normaliser les données avant l'entraînement. Cela aide le gradient descent à converger plus rapidement.

🎲 Probabilités

Théorème de Bayes

Bayes: P(A|B) = P(B|A)P(A) / P(B)
# Example: medical diagnosis
# P(Disease|Test+) = P(Test+|Disease) x P(Disease) / P(Test+)
def bayes_theorem(p_b_given_a, p_a, p_b):
    """Return P(A|B) computed with Bayes' rule."""
    return (p_b_given_a * p_a) / p_b

# Test sensitivity: 95%, prevalence: 1%, specificity: 90%
p_test_pos_given_sick = 0.95     # true positives
p_sick = 0.01
p_test_pos_given_healthy = 0.10  # false positives
p_healthy = 0.99

# Law of total probability: P(Test+)
p_test_pos = (p_test_pos_given_sick * p_sick
              + p_test_pos_given_healthy * p_healthy)

p_sick_given_test_pos = bayes_theorem(
    p_test_pos_given_sick, p_sick, p_test_pos
)
print(f"P(Malade|Test+) = {p_sick_given_test_pos:.2%}")  # ~8.8%

Distributions

import numpy as np from scipy import stats # Distribution Normale (Gaussienne) mu, sigma = 0, 1 normal = stats.norm(mu, sigma) samples = normal.rvs(size=1000) pdf = normal.pdf(x) # Probability Density Function cdf = normal.cdf(x) # Cumulative Distribution Function # Distribution Bernoulli (binaire) p = 0.7 bernoulli = stats.bernoulli(p) samples = bernoulli.rvs(size=100) # 0 ou 1 # Distribution Binomiale n, p = 10, 0.5 binomial = stats.binom(n, p) samples = binomial.rvs(size=1000) # Distribution Uniforme uniform = stats.uniform(loc=0, scale=1) # entre 0 et 1 samples = uniform.rvs(size=1000) # Distribution Exponentielle lambda_param = 1.5 exponential = stats.expon(scale=1/lambda_param) samples = exponential.rvs(size=1000)

Softmax

Softmax: σ(z)ᵢ = e^zᵢ / Σⱼ e^zⱼ
def softmax(x):
    """Numerically stable softmax along the last axis."""
    # Subtracting the row max leaves the result unchanged but
    # keeps np.exp from overflowing.
    shifted = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return shifted / np.sum(shifted, axis=-1, keepdims=True)

# Example
logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)
print(probs)          # [0.659, 0.242, 0.099]
print(np.sum(probs))  # 1.0

# PyTorch equivalents
import torch.nn.functional as F
logits = torch.tensor([2.0, 1.0, 0.1])
probs = F.softmax(logits, dim=0)

# Log softmax (numerically safer for downstream loss computations)
log_probs = F.log_softmax(logits, dim=0)

Statistiques de Base

# Moyenne, médiane, écart-type data = np.random.randn(1000) mean = np.mean(data) median = np.median(data) std = np.std(data) variance = np.var(data) # Quantiles q25, q50, q75 = np.percentile(data, [25, 50, 75]) # Corrélation x = np.random.randn(100) y = 2*x + np.random.randn(100)*0.5 correlation = np.corrcoef(x, y)[0, 1] # Covariance covariance = np.cov(x, y)[0, 1]
Distribution Paramètres Usage ML
Normale μ (moyenne), σ (écart-type) Initialisation poids, bruit
Bernoulli p (probabilité succès) Classification binaire
Uniforme a (min), b (max) Initialisation poids
Exponentielle λ (rate) Temps d'attente, durées

📊 Entropie & Information

Entropie de Shannon

Entropie: H(X) = -Σ P(x) log P(x)
def entropy(probs):
    """Shannon entropy in bits; zero-probability entries contribute 0."""
    nonzero = probs[probs > 0]  # avoid log(0)
    return -np.sum(nonzero * np.log2(nonzero))

# A uniform distribution has maximal entropy...
uniform_probs = np.array([0.25, 0.25, 0.25, 0.25])
print(f"Entropy (uniform): {entropy(uniform_probs):.2f} bits")  # 2.0

# ...while a peaked distribution carries little uncertainty
peaked_probs = np.array([0.9, 0.05, 0.03, 0.02])
print(f"Entropy (peaked): {entropy(peaked_probs):.2f} bits")  # ~0.62

Cross-Entropy

Cross-Entropy: H(P,Q) = -Σ P(x) log Q(x)
def cross_entropy(y_true, y_pred):
    """Cross-entropy between the true distribution and a prediction."""
    # The epsilon keeps log() finite when a predicted probability is 0.
    return -np.sum(y_true * np.log(y_pred + 1e-10))

# Classification example
y_true = np.array([0, 1, 0])        # one-hot, class 1
y_pred = np.array([0.1, 0.8, 0.1])  # predicted distribution
ce = cross_entropy(y_true, y_pred)
print(f"Cross-Entropy: {ce:.4f}")  # 0.2231

# PyTorch CrossEntropyLoss — expects raw logits, applies softmax itself
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]])  # pre-softmax scores
target = torch.tensor([0])                # class 0
loss = criterion(logits, target)

# Binary Cross-Entropy — expects probabilities in [0, 1]
bce = nn.BCELoss()
predictions = torch.tensor([0.8, 0.3, 0.9])
targets = torch.tensor([1.0, 0.0, 1.0])
loss = bce(predictions, targets)

KL Divergence

KL Divergence: D_KL(P||Q) = Σ P(x) log(P(x)/Q(x))
def kl_divergence(p, q):
    """KL divergence from P to Q (note: NOT symmetric in p, q)."""
    # Epsilons guard against division by zero and log(0).
    return np.sum(p * np.log(p / (q + 1e-10) + 1e-10))

# Example
p = np.array([0.4, 0.3, 0.3])
q = np.array([0.3, 0.4, 0.3])
kl = kl_divergence(p, q)
print(f"KL(P||Q): {kl:.4f}")

# Swapping the arguments generally gives a different value
kl_reverse = kl_divergence(q, p)
print(f"KL(Q||P): {kl_reverse:.4f}")

# PyTorch KLDivLoss
kl_loss = nn.KLDivLoss(reduction='batchmean')
# The first argument must be log-probabilities
log_q = torch.log(torch.tensor(q))
p_tensor = torch.tensor(p)
loss = kl_loss(log_q, p_tensor)

Perplexity

Perplexity: PP(P) = 2^H(P) = 2^(−Σ P(x) log₂ P(x))
def perplexity(probs):
    """Perplexity = 2 ** entropy: the effective branching factor."""
    return 2 ** entropy(probs)

# Example: language modelling — perplexity measures how "surprised"
# the model is by the data.
vocab_size = 10000

# A uniform model (worst case) is as perplexed as the vocabulary is large
uniform = np.ones(vocab_size) / vocab_size
pp_uniform = perplexity(uniform)
print(f"Perplexity (uniform): {pp_uniform:.0f}")  # 10000

# A concentrated model is far less perplexed
good_model = np.zeros(vocab_size)
good_model[:100] = 0.01  # all mass on 100 words
pp_good = perplexity(good_model)
print(f"Perplexity (good): {pp_good:.0f}")  # ~100
Métrique Formule Interprétation
Entropy -Σ P log P Incertitude moyenne
Cross-Entropy -Σ P log Q Loss classification
KL Divergence Σ P log(P/Q) Distance entre distributions
Perplexity 2^H(P) Équivalent branching factor
💡 Intuition: Cross-Entropy = Entropy + KL Divergence. En minimisant la cross-entropy, on minimise la divergence KL entre prédictions et vraie distribution.

⚡ Optimiseurs

SGD (Stochastic Gradient Descent)

SGD: θ = θ - α∇J(θ)
SGD + Momentum: v = βv + ∇J(θ); θ = θ - αv
import torch.optim as optim # SGD basique optimizer = optim.SGD(model.parameters(), lr=0.01) # SGD avec momentum optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) # SGD avec momentum et weight decay optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4) # Nesterov momentum (accélération) optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

Adam & AdamW

Adam:
m = β₁m + (1-β₁)∇J (moment 1er ordre)
v = β₂v + (1-β₂)∇J² (moment 2e ordre)
θ = θ - α·m̂/(√v̂ + ε)
# Adam (le plus populaire) optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8) # AdamW (meilleur pour transformers) # Découple weight decay du gradient optimizer = optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.01) # Valeurs typiques: # - lr: 1e-3 to 1e-4 # - beta1: 0.9 # - beta2: 0.999 # - weight_decay: 0.01 (AdamW) ou 0 (Adam)

Autres Optimiseurs

# RMSprop (bon pour RNNs) optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-8) # Adagrad (rare aujourd'hui) optimizer = optim.Adagrad(model.parameters(), lr=0.01) # Adadelta optimizer = optim.Adadelta(model.parameters(), lr=1.0) # LAMB (pour très gros batches) # pip install pytorch-lamb from pytorch_lamb import Lamb optimizer = Lamb(model.parameters(), lr=0.001)

Learning Rate Scheduling

from torch.optim.lr_scheduler import * # Linear warmup puis cosine decay def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps): def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / \ float(max(1, num_training_steps - num_warmup_steps)) return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress))) return LambdaLR(optimizer, lr_lambda) # One Cycle Policy (très efficace) scheduler = OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=epochs) # Training loop avec scheduler for epoch in range(epochs): for batch in train_loader: optimizer.zero_grad() loss = model(batch) loss.backward() optimizer.step() scheduler.step() # Appel après chaque batch
Optimiseur LR Typique Cas d'usage
SGD + Momentum 0.01 - 0.1 Vision (ResNet, etc.)
Adam 1e-3 - 1e-4 Usage général
AdamW 1e-3 - 1e-5 Transformers, NLP
RMSprop 1e-3 - 1e-4 RNNs, Reinforcement Learning
🎯 Conseil: Pour transformers, utilisez AdamW avec warmup. Pour CNN, SGD + momentum converge souvent mieux qu'Adam.

🔬 Scikit-learn

Pipeline

from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier # Créer pipeline pipeline = Pipeline([ ('scaler', StandardScaler()), ('pca', PCA(n_components=0.95)), # Garde 95% variance ('classifier', RandomForestClassifier(n_estimators=100)) ]) # Fit & predict pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) # Accéder aux étapes scaler = pipeline.named_steps['scaler'] pca = pipeline.named_steps['pca']

GridSearchCV

from sklearn.model_selection import GridSearchCV # Définir grille de paramètres param_grid = { 'pca__n_components': [10, 20, 50], 'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [5, 10, None], 'classifier__min_samples_split': [2, 5, 10] } # GridSearch grid_search = GridSearchCV( pipeline, param_grid, cv=5, # 5-fold cross-validation scoring='f1_weighted', n_jobs=-1, # Parallélisation verbose=2 ) grid_search.fit(X_train, y_train) # Meilleurs paramètres print(f"Best params: {grid_search.best_params_}") print(f"Best score: {grid_search.best_score_:.4f}") # Meilleur modèle best_model = grid_search.best_estimator_

Train-Test Split

from sklearn.model_selection import train_test_split # Split basique X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) # Stratified split (garde proportions classes) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) # Split en 3: train/val/test X_train, X_temp, y_train, y_temp = train_test_split( X, y, test_size=0.3, random_state=42 ) X_val, X_test, y_val, y_test = train_test_split( X_temp, y_temp, test_size=0.5, random_state=42 ) # 70% train, 15% val, 15% test

Métriques Courantes

from sklearn.metrics import * # Classification accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, average='weighted') recall = recall_score(y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') # Rapport complet print(classification_report(y_true, y_pred)) # Matrice de confusion cm = confusion_matrix(y_true, y_pred) # ROC-AUC roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr') # Régression mse = mean_squared_error(y_true, y_pred) rmse = np.sqrt(mse) mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred)

Preprocessing

from sklearn.preprocessing import * # Standardization (mean=0, std=1) scaler = StandardScaler() X_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # Min-Max scaling (range [0,1]) minmax = MinMaxScaler() X_normalized = minmax.fit_transform(X_train) # Robust scaling (robuste aux outliers) robust = RobustScaler() X_robust = robust.fit_transform(X_train) # Label encoding le = LabelEncoder() y_encoded = le.fit_transform(y) # One-hot encoding ohe = OneHotEncoder(sparse=False) X_onehot = ohe.fit_transform(X_categorical)

📈 Évaluation Modèles

Precision, Recall, F1

Precision: TP / (TP + FP)
Recall: TP / (TP + FN)
F1: 2 × (Precision × Recall) / (Precision + Recall)
from sklearn.metrics import precision_recall_fscore_support # Calcul des métriques precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, average='weighted' ) print(f"Precision: {precision:.4f}") print(f"Recall: {recall:.4f}") print(f"F1-Score: {f1:.4f}") # Par classe precision, recall, f1, support = precision_recall_fscore_support( y_true, y_pred, average=None ) for i, (p, r, f, s) in enumerate(zip(precision, recall, f1, support)): print(f"Class {i}: P={p:.3f}, R={r:.3f}, F1={f:.3f}, n={s}")

Confusion Matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # Calculer matrice cm = confusion_matrix(y_true, y_pred) # Visualiser disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names) disp.plot(cmap='Blues') plt.title('Confusion Matrix') plt.show() # Métriques depuis confusion matrix # Pour binaire (2x2) TN, FP, FN, TP = cm.ravel() accuracy = (TP + TN) / (TP + TN + FP + FN) precision = TP / (TP + FP) recall = TP / (TP + FN) # = sensitivity specificity = TN / (TN + FP) f1 = 2 * (precision * recall) / (precision + recall)

ROC & AUC

from sklearn.metrics import roc_curve, roc_auc_score, auc # Classification binaire fpr, tpr, thresholds = roc_curve(y_true, y_scores) roc_auc = auc(fpr, tpr) # Plot ROC curve plt.figure(figsize=(8, 6)) plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})') plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc="lower right") plt.show() # Multi-class ROC from sklearn.preprocessing import label_binarize y_bin = label_binarize(y_true, classes=[0, 1, 2]) n_classes = y_bin.shape[1] # Compute ROC per class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_scores[:, i]) roc_auc[i] = auc(fpr[i], tpr[i])

Cross-Validation

from sklearn.model_selection import cross_val_score, cross_validate # Cross-validation simple scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted') print(f"F1 scores: {scores}") print(f"Mean: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})") # Multiple métriques scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted'] scores = cross_validate(model, X, y, cv=5, scoring=scoring) print(f"Precision: {scores['test_precision_weighted'].mean():.4f}") print(f"Recall: {scores['test_recall_weighted'].mean():.4f}") print(f"F1: {scores['test_f1_weighted'].mean():.4f}") # Stratified K-Fold from sklearn.model_selection import StratifiedKFold skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) scores = cross_val_score(model, X, y, cv=skf)
Métrique Quand l'utiliser
Accuracy Classes équilibrées
Precision Minimiser faux positifs (spam, fraude)
Recall Minimiser faux négatifs (maladie)
F1 Balance precision/recall, classes déséquilibrées
AUC-ROC Comparer modèles, indépendant du seuil
⚠️ Classes déséquilibrées: N'utilisez pas accuracy! Préférez F1-score, AUC-ROC, ou precision/recall selon votre cas d'usage.

🧠 Réseaux de Neurones

Perceptron

Perceptron: y = σ(w·x + b)
w = poids, b = biais, σ = fonction d'activation
import numpy as np


class Perceptron:
    """Single-layer perceptron with a Heaviside step activation.

    Trained with the perceptron learning rule (Rosenblatt), which updates
    weights from the prediction *error* — not Hebb's rule, which is purely
    correlation-based (the original docstring mislabelled it).
    """

    def __init__(self, n_inputs):
        # Random weight init; bias starts at zero.
        self.weights = np.random.randn(n_inputs)
        self.bias = 0

    def predict(self, X):
        """Forward pass: step(w·x + b). Works on one sample or a batch."""
        z = np.dot(X, self.weights) + self.bias
        return self.activation(z)

    def activation(self, z):
        """Heaviside step function: 1 if z >= 0, else 0."""
        return np.where(z >= 0, 1, 0)

    def fit(self, X, y, epochs=100, lr=0.01):
        """Train with the perceptron rule: w += lr * (target - pred) * x.

        Converges in finitely many updates when the data is linearly
        separable (perceptron convergence theorem).
        """
        for epoch in range(epochs):
            for xi, target in zip(X, y):
                prediction = self.predict(xi)
                error = target - prediction  # in {-1, 0, 1}
                self.weights += lr * error * xi
                self.bias += lr * error

Fonctions d'Activation

import torch
import torch.nn as nn

# Sigmoid: σ(x) = 1 / (1 + e^(-x))
sigmoid = nn.Sigmoid()
# Output in (0, 1) — used for binary probabilities.

# Tanh: tanh(x) = (e^x - e^(-x)) / (e^x + e^(-x))
tanh = nn.Tanh()
# Output in (-1, 1), zero-centered.

# ReLU: max(0, x)
relu = nn.ReLU()
# Output in [0, ∞) — the most popular hidden activation.

# Leaky ReLU: max(0.01x, x)
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
# Mitigates the "dying ReLU" problem (small gradient for x < 0).

# ELU: x if x > 0, else α(e^x - 1)
elu = nn.ELU(alpha=1.0)

# GELU: x·Φ(x) (Gaussian Error Linear Unit)
gelu = nn.GELU()
# Used in BERT, GPT and most Transformers.

# Softmax: e^xi / Σ e^xj
softmax = nn.Softmax(dim=-1)
# For multi-class classification outputs (rows sum to 1).

Backpropagation

Chain rule: ∂L/∂w = ∂L/∂y · ∂y/∂z · ∂z/∂w
# Manual backpropagation (educational): 2-layer net, ReLU hidden, sigmoid output.
class SimpleNet:
    def __init__(self, input_size, hidden_size, output_size):
        # Small random weights, zero biases.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        """Forward pass; caches intermediates for backward()."""
        self.z1 = X @ self.W1 + self.b1
        self.a1 = np.maximum(0, self.z1)          # ReLU
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = 1 / (1 + np.exp(-self.z2))      # sigmoid
        return self.a2

    def backward(self, X, y, output, lr=0.01):
        """One gradient-descent step, gradients averaged over the batch."""
        inv_m = 1.0 / X.shape[0]

        # Output layer: for a sigmoid output with BCE, dL/dz2 = output - y.
        grad_z2 = output - y
        grad_W2 = inv_m * (self.a1.T @ grad_z2)
        grad_b2 = inv_m * np.sum(grad_z2, axis=0, keepdims=True)

        # Hidden layer: propagate through W2, then gate by the ReLU mask.
        grad_a1 = grad_z2 @ self.W2.T
        grad_z1 = grad_a1 * (self.z1 > 0)
        grad_W1 = inv_m * (X.T @ grad_z1)
        grad_b1 = inv_m * np.sum(grad_z1, axis=0, keepdims=True)

        # Parameter updates.
        self.W1 -= lr * grad_W1
        self.b1 -= lr * grad_b1
        self.W2 -= lr * grad_W2
        self.b2 -= lr * grad_b2

MLP avec PyTorch

class MLP(nn.Module):
    """Fully connected classifier: [Linear → ReLU → Dropout(0.2)]* → Linear."""

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        dims = [input_size] + list(hidden_sizes)
        blocks = []
        # One block per hidden layer.
        for fan_in, fan_out in zip(dims[:-1], dims[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)]
        # Output layer: raw logits, no activation.
        blocks.append(nn.Linear(dims[-1], output_size))
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)


# Build the model
model = MLP(input_size=784, hidden_sizes=[512, 256, 128], output_size=10)
Activation Range Usage
Sigmoid (0, 1) Classification binaire (sortie)
Tanh (-1, 1) Hidden layers (rare maintenant)
ReLU [0, ∞) Hidden layers (standard)
Leaky ReLU (-∞, ∞) Éviter dying ReLU
GELU (-∞, ∞) Transformers
Softmax Σ = 1 Multi-classe (sortie)

🔥 PyTorch Essentials

Tensors

import torch # Créer tensors x = torch.tensor([1, 2, 3]) y = torch.zeros(3, 4) z = torch.ones(2, 3) rand = torch.randn(5, 5) # Distribution normale # Opérations a = torch.tensor([1.0, 2.0, 3.0]) b = torch.tensor([4.0, 5.0, 6.0]) c = a + b # Element-wise d = a * b # Element-wise e = torch.dot(a, b) # Dot product # Matrix operations A = torch.randn(3, 4) B = torch.randn(4, 5) C = A @ B # Matrix multiply (3, 5) C = torch.matmul(A, B) # Équivalent # Reshape x = torch.arange(12) x = x.view(3, 4) # Reshape x = x.view(-1, 2) # Auto-infer: (6, 2) x = x.reshape(3, 4) # Alternatif # Device (CPU/GPU) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') x = x.to(device) y = torch.randn(3, 4, device=device) # Créer direct sur GPU

Autograd

# Autograd: différentiation automatique x = torch.tensor(2.0, requires_grad=True) y = torch.tensor(3.0, requires_grad=True) # Forward pass z = x**2 + y**3 w = z * 2 # Backward pass w.backward() # Gradients print(f"dw/dx = {x.grad}") # 8.0 (= 2 * 2x) print(f"dw/dy = {y.grad}") # 54.0 (= 2 * 3y²) # Réinitialiser gradients x.grad.zero_() y.grad.zero_() # No grad (pour inference) with torch.no_grad(): predictions = model(x_test) # Ou @torch.no_grad() def evaluate(model, data): return model(data)

nn.Module

import torch.nn as nn


class MyModel(nn.Module):
    """Two-layer MLP: Linear → ReLU → Dropout(0.2) → Linear (returns logits)."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layer definitions
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Forward pass through both layers."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)


# Instantiate the model
model = MyModel(input_dim=784, hidden_dim=256, output_dim=10)

# Move it to the GPU (`device` is defined earlier in the file)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

DataLoader

from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    """Minimal map-style dataset wrapping paired data/label containers."""

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # Number of samples.
        return len(self.data)

    def __getitem__(self, idx):
        # One (sample, label) pair.
        return self.data[idx], self.labels[idx]


# Build the dataset
dataset = CustomDataset(X_train, y_train)

# Wrap it in a DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,    # parallel worker processes
    pin_memory=True   # faster host→GPU transfers
)

# Iterate over mini-batches
for batch_x, batch_y in dataloader:
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    # Training...

Training Loop

import torch.optim as optim # Setup model = MyModel(784, 256, 10).to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=0.001) # Training loop num_epochs = 10 for epoch in range(num_epochs): model.train() # Mode training train_loss = 0 for batch_x, batch_y in train_loader: batch_x = batch_x.to(device) batch_y = batch_y.to(device) # Forward pass outputs = model(batch_x) loss = criterion(outputs, batch_y) # Backward pass optimizer.zero_grad() loss.backward() optimizer.step() train_loss += loss.item() # Validation model.eval() # Mode evaluation val_loss = 0 correct = 0 total = 0 with torch.no_grad(): for batch_x, batch_y in val_loader: batch_x = batch_x.to(device) batch_y = batch_y.to(device) outputs = model(batch_x) loss = criterion(outputs, batch_y) val_loss += loss.item() _, predicted = outputs.max(1) total += batch_y.size(0) correct += predicted.eq(batch_y).sum().item() accuracy = 100. * correct / total print(f"Epoch {epoch+1}: " f"Train Loss: {train_loss/len(train_loader):.4f}, " f"Val Loss: {val_loss/len(val_loader):.4f}, " f"Val Acc: {accuracy:.2f}%")
⚡ Performance: Utilisez pin_memory=True dans DataLoader et .to(device, non_blocking=True) pour transferts GPU plus rapides.

📉 Loss Functions

Mean Squared Error (MSE)

MSE: L = (1/n) Σ (yᵢ - ŷᵢ)²
import torch.nn as nn # MSE Loss (régression) mse_loss = nn.MSELoss() predictions = torch.tensor([2.5, 3.7, 1.2]) targets = torch.tensor([3.0, 4.0, 1.0]) loss = mse_loss(predictions, targets) # MAE Loss (plus robuste aux outliers) mae_loss = nn.L1Loss() loss = mae_loss(predictions, targets) # Huber Loss (combine MSE et MAE) huber_loss = nn.SmoothL1Loss() loss = huber_loss(predictions, targets)

Cross-Entropy Loss

CE: L = -Σ yᵢ log(ŷᵢ)
Pour classification: L = -log(ŷ_{classe vraie})
# CrossEntropyLoss (classification multi-classe) # Combine LogSoftmax + NLLLoss ce_loss = nn.CrossEntropyLoss() # Input: logits (avant softmax), shape (batch, num_classes) logits = torch.tensor([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]]) # Target: indices de classe, shape (batch,) targets = torch.tensor([0, 1]) loss = ce_loss(logits, targets) # Avec poids de classe (pour déséquilibre) class_weights = torch.tensor([1.0, 2.0, 1.5]) weighted_ce = nn.CrossEntropyLoss(weight=class_weights) # Label smoothing (régularisation) ce_smoothed = nn.CrossEntropyLoss(label_smoothing=0.1)

Binary Cross-Entropy (BCE)

BCE: L = -[y log(ŷ) + (1-y) log(1-ŷ)]
# BCELoss (classification binaire) # Input doit être entre 0 et 1 (après sigmoid) bce_loss = nn.BCELoss() predictions = torch.tensor([0.8, 0.3, 0.9]) targets = torch.tensor([1.0, 0.0, 1.0]) loss = bce_loss(predictions, targets) # BCEWithLogitsLoss (plus stable numériquement) # Input: logits (avant sigmoid) bce_logits = nn.BCEWithLogitsLoss() logits = torch.tensor([1.5, -0.8, 2.1]) targets = torch.tensor([1.0, 0.0, 1.0]) loss = bce_logits(logits, targets) # Multi-label classification # (plusieurs classes peuvent être vraies) predictions = torch.tensor([[0.8, 0.3, 0.9], [0.2, 0.7, 0.4]]) targets = torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]) loss = bce_loss(predictions, targets)

Autres Loss Functions

# NLLLoss (Negative Log Likelihood) # Input doit être log-probabilities nll_loss = nn.NLLLoss() log_probs = torch.log_softmax(logits, dim=1) loss = nll_loss(log_probs, targets) # KLDivLoss (Divergence KL) kl_loss = nn.KLDivLoss(reduction='batchmean') # Input: log-probabilities, Target: probabilities loss = kl_loss(log_probs, target_probs) # CosineEmbeddingLoss (pour embeddings) cosine_loss = nn.CosineEmbeddingLoss() embedding1 = torch.randn(32, 128) embedding2 = torch.randn(32, 128) target = torch.ones(32) # 1 si similaire, -1 si différent loss = cosine_loss(embedding1, embedding2, target) # TripletMarginLoss (pour metric learning) triplet_loss = nn.TripletMarginLoss(margin=1.0) anchor = torch.randn(32, 128) positive = torch.randn(32, 128) # Même classe negative = torch.randn(32, 128) # Classe différente loss = triplet_loss(anchor, positive, negative)
Loss Task Input Target
MSE Régression Valeurs réelles Valeurs réelles
CrossEntropy Multi-classe Logits Indices classe
BCE Binaire Probabilities [0,1] 0 ou 1
BCEWithLogits Binaire Logits 0 ou 1
NLL Multi-classe Log-probabilities Indices classe
💡 Astuce: Utilisez toujours BCEWithLogitsLoss au lieu de Sigmoid + BCELoss, et CrossEntropyLoss au lieu de Softmax + NLLLoss. Plus stable numériquement!

🖼️ CNN Architecture

Conv2d Parameters

Output size: O = ⌊(W - K + 2P) / S⌋ + 1
W=input, K=kernel, P=padding, S=stride
import torch.nn as nn # Convolution 2D conv = nn.Conv2d( in_channels=3, # RGB out_channels=64, # Nombre de filtres kernel_size=3, # 3x3 stride=1, padding=1, # 'same' padding bias=True ) # Exemple calcul taille sortie # Input: (batch, 3, 32, 32) # Output: (batch, 64, 32, 32) # Car: (32 - 3 + 2*1) / 1 + 1 = 32 # Depthwise Separable Convolution (MobileNet) depthwise = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64) pointwise = nn.Conv2d(64, 128, kernel_size=1) # Dilated/Atrous Convolution dilated = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)

Pooling Layers

# Max Pooling (le plus commun) maxpool = nn.MaxPool2d( kernel_size=2, # 2x2 stride=2 # Réduit taille par 2 ) # Input: (batch, 64, 32, 32) # Output: (batch, 64, 16, 16) # Average Pooling avgpool = nn.AvgPool2d(kernel_size=2, stride=2) # Adaptive Average Pooling (taille output fixe) adaptive = nn.AdaptiveAvgPool2d((7, 7)) # Input: (batch, 512, 14, 14) # Output: (batch, 512, 7, 7) # Global Average Pooling global_pool = nn.AdaptiveAvgPool2d((1, 1)) # Input: (batch, 512, 7, 7) # Output: (batch, 512, 1, 1)

CNN Simple

class SimpleCNN(nn.Module):
    """Three conv blocks (Conv → BN → ReLU → MaxPool) + GAP classifier head."""

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_block(c_in, c_out):
            # Standard block; MaxPool halves the spatial resolution.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2, 2),
            ]

        # Channels double each time the resolution is halved.
        self.features = nn.Sequential(
            *conv_block(3, 64),
            *conv_block(64, 128),
            *conv_block(128, 256),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # fixed size regardless of input
            nn.Flatten(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        return self.classifier(self.features(x))

ResNet Block

class ResidualBlock(nn.Module):
    """Basic ResNet block: two 3x3 convs plus an identity/projection shortcut."""

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # 1x1 projection shortcut only when the shape changes; otherwise
        # the shortcut is an identity (empty Sequential).
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + residual  # skip connection
        return self.relu(out)
Layer Paramètres Usage
Conv2d in_ch, out_ch, kernel, stride, padding Extraction features
MaxPool2d kernel, stride Downsampling
BatchNorm2d num_features Normalisation, stabilité
Dropout2d p (prob drop) Régularisation
AdaptiveAvgPool2d output_size Pooling taille fixe
🎨 Architecture CNN: Pattern typique: Conv → BatchNorm → ReLU → Pool. Doubler les canaux quand on divise la résolution par 2.

🔄 RNN/LSTM/GRU

RNN Simple

RNN: h_t = tanh(W_hh·h_{t-1} + W_xh·x_t + b)
import torch.nn as nn # RNN basique rnn = nn.RNN( input_size=100, # Dimension input hidden_size=256, # Dimension hidden state num_layers=2, # Nombre couches empilées batch_first=True, # Input shape: (batch, seq, features) dropout=0.2, # Dropout entre couches bidirectional=False ) # Forward pass x = torch.randn(32, 10, 100) # (batch, seq_len, input_size) h0 = torch.zeros(2, 32, 256) # (num_layers, batch, hidden_size) output, hn = rnn(x, h0) # output: (32, 10, 256) - output à chaque timestep # hn: (2, 32, 256) - dernier hidden state # RNN Cell (manuel) rnn_cell = nn.RNNCell(input_size=100, hidden_size=256) h = torch.zeros(32, 256) outputs = [] for t in range(10): h = rnn_cell(x[:, t, :], h) outputs.append(h)

LSTM

LSTM gates:
f_t = σ(W_f·[h_{t-1}, x_t]) (forget)
i_t = σ(W_i·[h_{t-1}, x_t]) (input)
o_t = σ(W_o·[h_{t-1}, x_t]) (output)
# LSTM: the standard choice for sequence modelling.
lstm = nn.LSTM(
    input_size=100,
    hidden_size=256,
    num_layers=2,
    batch_first=True,
    dropout=0.2,
    bidirectional=True  # Bi-LSTM
)

# Forward pass
x = torch.randn(32, 10, 100)
h0 = torch.zeros(2*2, 32, 256)  # num_layers * 2 directions
c0 = torch.zeros(2*2, 32, 256)  # cell state
output, (hn, cn) = lstm(x, (h0, c0))
# output: (32, 10, 512) — 512 because bidirectional (256 per direction)
# hn: (4, 32, 256) — final hidden states
# cn: (4, 32, 256) — final cell states


# Example: sequence classification
class LSTMClassifier(nn.Module):
    """Token ids → embedding → Bi-LSTM → linear head (logits)."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token ids
        embedded = self.embedding(x)            # (batch, seq_len, embed_dim)
        output, (hn, cn) = self.lstm(embedded)
        # hn[-2] is the last forward state, hn[-1] the last backward state:
        # concatenate them for the classifier.
        hidden = torch.cat([hn[-2], hn[-1]], dim=1)
        return self.fc(hidden)

GRU

GRU: Plus simple que LSTM, souvent aussi bon
r_t = σ(W_r·[h_{t-1}, x_t]) (reset)
z_t = σ(W_z·[h_{t-1}, x_t]) (update)
# GRU (plus rapide que LSTM) gru = nn.GRU( input_size=100, hidden_size=256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True ) # Forward pass x = torch.randn(32, 10, 100) h0 = torch.zeros(2*2, 32, 256) output, hn = gru(x, h0) # Pas de cell state (plus simple que LSTM)

Quand utiliser chaque type

Type Avantages Quand utiliser
RNN Simple, rapide Séquences courtes, baseline
LSTM Gère dépendances longues, stable Séquences longues, standard NLP
GRU Plus rapide que LSTM, moins de paramètres Alternative à LSTM, contraintes compute
Bi-LSTM/GRU Context bidirectionnel Classification, pas génération

Handling Sequences

# Padding sequences (différentes longueurs) from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence # Sequences de longueurs variables seqs = [torch.randn(5, 100), torch.randn(8, 100), torch.randn(3, 100)] # Padding padded = pad_sequence(seqs, batch_first=True) # Shape: (3, 8, 100) - paddé à max length # Pack (efficace pour training) lengths = torch.tensor([5, 8, 3]) packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False) # Forward avec packed output, hn = lstm(packed) # Unpack output, lengths = pad_packed_sequence(output, batch_first=True)
⚠️ Vanishing Gradient: RNN basiques souffrent de vanishing gradient sur longues séquences. LSTM/GRU résolvent ce problème. Pour séquences très longues (>500), considérez Transformer.

🤖 Transformer Basics

Self-Attention

Attention: Attention(Q,K,V) = softmax(Q·Kᵀ/√d_k)·V
Q = queries, K = keys, V = values, d_k = dimension des keys
import torch
import torch.nn as nn
import math


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention."""

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        # Learned projections for queries, keys and values.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.scale = math.sqrt(embed_dim)

    def forward(self, x, mask=None):
        """x: (batch, seq_len, embed_dim) → (output, attention_weights)."""
        batch_size, seq_len, _ = x.shape

        q = self.query(x)   # (batch, seq_len, embed_dim)
        k = self.key(x)
        v = self.value(x)

        # Similarity of every position with every other, scaled by √d:
        # (batch, seq_len, seq_len).
        scores = torch.bmm(q, k.transpose(1, 2)) / self.scale

        # Optional mask (padding or causal): blocked positions → -1e9
        # so softmax sends them to ~0.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the values.
        return torch.bmm(attn_weights, v), attn_weights

Multi-Head Attention

Multi-Head: MultiHead(Q,K,V) = Concat(head_1, ..., head_h)·W^O
où head_i = Attention(Q·W_i^Q, K·W_i^K, V·W_i^V)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with a fused QKV projection."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # One linear layer produces Q, K and V for every head at once.
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.out = nn.Linear(embed_dim, embed_dim)
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # (batch, seq, 3*E) → (3, batch, heads, seq, head_dim)
        qkv = self.qkv(x)
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        Q, K, V = qkv[0], qkv[1], qkv[2]

        # Per-head scaled attention scores.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)

        # Weighted values, then merge the heads back together.
        context = torch.matmul(attn, V)  # (batch, heads, seq, head_dim)
        context = context.transpose(1, 2)  # (batch, seq, heads, head_dim)
        context = context.reshape(batch_size, seq_len, embed_dim)
        return self.out(context)


# PyTorch built-in equivalent
mha = nn.MultiheadAttention(
    embed_dim=512,
    num_heads=8,
    dropout=0.1,
    batch_first=True
)
output, attn_weights = mha(x, x, x)  # self-attention (`x` defined earlier in the file)

Positional Encoding

Position:
PE(pos, 2i) = sin(pos / 10000^(2i/d))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding (Vaswani et al., 2017)."""

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        # Precompute the (max_len, embed_dim) table once.
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float()
                             * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)  # even dims → sin
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dims → cos
        # Buffer: follows .to(device) and state_dict, but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for positions 0..seq_len-1 to x."""
        return x + self.pe[:, :x.size(1), :]


# Alternative: learned positional embeddings
class LearnedPositionalEmbedding(nn.Module):
    """Trainable alternative: one embedding vector per position index."""

    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(max_len, embed_dim)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        return x + self.embedding(positions)

Transformer Block

class TransformerBlock(nn.Module):
    """Post-LN encoder block: self-attention and FFN, each with residual + norm."""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Multi-head self-attention.
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )
        # Position-wise feed-forward: expand → GELU → project back.
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Sub-layer 1: attention, residual, LayerNorm.
        attn_output, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Sub-layer 2: feed-forward, residual, LayerNorm.
        return self.norm2(x + self.ff(x))


# Full encoder-only Transformer classifier
class Transformer(nn.Module):
    """Embed → positions → N encoder blocks → mean-pool → linear head."""

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers,
                 ff_dim, max_len, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token ids
        h = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            h = block(h)
        # Mean-pool over the sequence (alternative: a [CLS] token).
        return self.classifier(h.mean(dim=1))
Composant Rôle
Self-Attention Relations entre tokens
Multi-Head Différents aspects d'attention
Positional Encoding Information de position
Feed-Forward Transformation non-linéaire
Layer Norm Stabilisation training
Residual Connections Gradient flow, deep networks
🚀 Transformers: Remplacent RNN/LSTM dans la plupart des tâches NLP. Parallélisables (vs RNN séquentiel) et capturent mieux les dépendances longues. Base de BERT, GPT, etc.