Gemini deployment, security, compliance, FinOps, and governance
Migrating a prompt from AI Studio to Vertex AI:
# 1. Export the prompt from AI Studio
prompt = "Your optimized prompt"

# 2. Create the Vertex AI client
import vertexai
from vertexai.generative_models import GenerativeModel

vertexai.init(project="my-gemini-prod", location="us-central1")
model = GenerativeModel("gemini-2.0-flash-exp")

# 3. Add a production generation config
generation_config = {
    "temperature": 0.7,
    "max_output_tokens": 2048,
    "top_p": 0.95
}
response = model.generate_content(prompt, generation_config=generation_config)
# 1. Create the project
gcloud projects create my-gemini-prod \
  --name="Gemini Production"

# 2. Link the billing account
gcloud billing projects link my-gemini-prod \
  --billing-account=BILLING_ACCOUNT_ID

# 3. Enable the APIs
gcloud services enable aiplatform.googleapis.com \
  compute.googleapis.com \
  storage.googleapis.com \
  --project=my-gemini-prod
# Check current quotas
gcloud compute project-info describe \
  --project=my-gemini-prod

# Default Gemini 2.0 quotas:
# - Flash: 2000 RPM, 4M TPM
# - Pro: 1000 RPM, 4M TPM
# - Flash-8B: 4000 RPM, 8M TPM

# Request a quota increase
gcloud alpha services quota update \
  --service=aiplatform.googleapis.com \
  --consumer=projects/my-gemini-prod \
  --metric=GenerateContentRequests \
  --value=10000
# Dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

# Deploy
gcloud run deploy gemini-api \
  --source . \
  --region us-central1 \
  --allow-unauthenticated \
  --memory 2Gi \
  --cpu 2 \
  --timeout 300s
gcloud run services update gemini-api \
  --min-instances 1 \
  --max-instances 100 \
  --concurrency 80 \
  --cpu-throttling \
  --execution-environment gen2

# Parameters:
# - min-instances: avoids cold starts
# - max-instances: caps cost
# - concurrency: requests per instance
# - cpu-throttling: CPU allocated only while a request is being handled
# 1. Create a dedicated service account
gcloud iam service-accounts create gemini-runner \
  --display-name="Cloud Run Gemini Service"

# 2. Grant roles
gcloud projects add-iam-policy-binding PROJECT_ID \
  --member="serviceAccount:gemini-runner@PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/aiplatform.user"

# 3. Attach it to the Cloud Run service
gcloud run services update gemini-api \
  --service-account=gemini-runner@PROJECT_ID.iam.gserviceaccount.com
# GKE Autopilot (recommended for production)
gcloud container clusters create-auto gemini-cluster \
  --region us-central1 \
  --release-channel regular \
  --enable-private-nodes \
  --enable-private-endpoint \
  --workload-pool=PROJECT_ID.svc.id.goog

# Autopilot advantages:
# - Automatic node management
# - Hardened security by default
# - Per-pod billing (vs per-node)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gemini-api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: gemini-api
  template:
    metadata:
      labels:
        app: gemini-api
    spec:
      serviceAccountName: gemini-ksa
      containers:
      - name: api
        image: gcr.io/PROJECT_ID/gemini-api:latest
        resources:
          requests:
            memory: "2Gi"
            cpu: "1000m"
          limits:
            memory: "4Gi"
            cpu: "2000m"
        env:
        - name: GCP_PROJECT
          value: "my-gemini-prod"
# Service
apiVersion: v1
kind: Service
metadata:
  name: gemini-api-svc
spec:
  selector:
    app: gemini-api
  ports:
  - port: 80
    targetPort: 8080
  type: ClusterIP
# Ingress (HTTPS + Load Balancer)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gemini-ingress
  annotations:
    kubernetes.io/ingress.class: "gce"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
  tls:
  - hosts:
    - api.example.com
    secretName: gemini-tls
  rules:
  - host: api.example.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: gemini-api-svc
            port:
              number: 80
# 1. Create the Kubernetes service account
kubectl create serviceaccount gemini-ksa -n default

# 2. Create the GCP service account
gcloud iam service-accounts create gemini-gsa

# 3. Bind KSA → GSA
gcloud iam service-accounts add-iam-policy-binding \
  gemini-gsa@PROJECT_ID.iam.gserviceaccount.com \
  --role roles/iam.workloadIdentityUser \
  --member "serviceAccount:PROJECT_ID.svc.id.goog[default/gemini-ksa]"

# 4. Annotate the KSA
kubectl annotate serviceaccount gemini-ksa \
  iam.gke.io/gcp-service-account=gemini-gsa@PROJECT_ID.iam.gserviceaccount.com
# ❌ Bad: overly broad role
gcloud projects add-iam-policy-binding PROJECT_ID \
  --member="serviceAccount:app@PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/editor"

# ✅ Good: granular roles
gcloud projects add-iam-policy-binding PROJECT_ID \
  --member="serviceAccount:app@PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/aiplatform.user"

gcloud projects add-iam-policy-binding PROJECT_ID \
  --member="serviceAccount:app@PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/storage.objectViewer"
# Enable Data Access logs for Vertex AI (IAM policy auditConfigs)
auditConfigs:
- service: aiplatform.googleapis.com
  auditLogConfigs:
  - logType: ADMIN_READ
  - logType: DATA_READ
  - logType: DATA_WRITE

# Logged for each Gemini request:
# - Who called the API
# - Which model was used
# - Timestamp and region
# - Prompt (if DATA_READ is enabled; mind GDPR)
# Restrict access by source IP (IAM condition)
gcloud projects add-iam-policy-binding PROJECT_ID \
  --member="serviceAccount:app@PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/aiplatform.user" \
  --condition='expression=origin.ip in ["203.0.113.0/24"],title=allow-office-ip,description=Access only from the office network'

# Restrict by time of day (swap this expression into --condition)
expression='request.time.getHours("Europe/Paris") >= 9 && request.time.getHours("Europe/Paris") <= 18'
# 1. Create the perimeter
gcloud access-context-manager perimeters create gemini_perimeter \
  --title="Gemini Production Perimeter" \
  --resources=projects/123456789 \
  --restricted-services=aiplatform.googleapis.com,storage.googleapis.com \
  --policy=POLICY_ID

# 2. Ingress rule (allow CI/CD)
gcloud access-context-manager perimeters update gemini_perimeter \
  --add-ingress-policies=ingress.yaml

# ingress.yaml
ingressFrom:
  sources:
  - resource: projects/CICD_PROJECT_ID
ingressTo:
  operations:
  - serviceName: aiplatform.googleapis.com
# 1. Create a Cloud KMS keyring
gcloud kms keyrings create gemini-keyring \
  --location us-central1

# 2. Create the encryption key
gcloud kms keys create gemini-key \
  --keyring gemini-keyring \
  --location us-central1 \
  --purpose encryption

# 3. Grant the role to the Vertex AI service agent
gcloud kms keys add-iam-policy-binding gemini-key \
  --keyring gemini-keyring \
  --location us-central1 \
  --member serviceAccount:service-PROJECT_NUMBER@gcp-sa-aiplatform.iam.gserviceaccount.com \
  --role roles/cloudkms.cryptoKeyEncrypterDecrypter
from google.cloud import dlp_v2

PROJECT_ID = "my-gemini-prod"

def redact_pii(text: str) -> str:
    """Replace detected PII with its info type before the text reaches Gemini."""
    dlp = dlp_v2.DlpServiceClient()
    inspect_config = {
        "info_types": [
            {"name": "EMAIL_ADDRESS"},
            {"name": "PHONE_NUMBER"},
            {"name": "PERSON_NAME"}
        ]
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_with_info_type_config": {}
                }
            }]
        }
    }
    response = dlp.deidentify_content(
        request={
            "parent": f"projects/{PROJECT_ID}",
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": {"value": text}
        }
    )
    return response.item.value
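A minimal usage sketch, assuming the `model` client created earlier: PII is stripped from the user prompt before it is sent to Gemini (the sample prompt is illustrative).

# Hypothetical integration: redact PII before calling Gemini
user_prompt = "Contact Jean Dupont at jean.dupont@example.com about his order"
safe_prompt = redact_pii(user_prompt)
# -> "Contact [PERSON_NAME] at [EMAIL_ADDRESS] about his order"
response = model.generate_content(safe_prompt)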
# Cloud Logging retention (GDPR: at most 30 days when PII is logged)
# Retention is set on the log bucket that receives the Vertex AI entries
gcloud logging sinks update gemini-logs \
  --log-filter='resource.type="aiplatform.googleapis.com/Endpoint"'
gcloud logging buckets update gemini-logs-bucket \
  --location=global \
  --retention-days=30
# Delete Cloud Storage data after 90 days
gsutil lifecycle set lifecycle.json gs://gemini-data

# lifecycle.json
{
  "lifecycle": {
    "rule": [{
      "action": {"type": "Delete"},
      "condition": {"age": 90}
    }]
  }
}
# cloudbuild.yaml
steps:
# 1. Run tests
- name: 'python:3.11'
  entrypoint: 'pip'
  args: ['install', '-r', 'requirements.txt']
- name: 'python:3.11'
  entrypoint: 'pytest'
  args: ['tests/']
# 2. Build Docker image
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', 'gcr.io/$PROJECT_ID/gemini-api:$SHORT_SHA', '.']
# 3. Push to GCR
- name: 'gcr.io/cloud-builders/docker'
  args: ['push', 'gcr.io/$PROJECT_ID/gemini-api:$SHORT_SHA']
# 4. Deploy to Cloud Run
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
  entrypoint: 'gcloud'
  args:
  - 'run'
  - 'deploy'
  - 'gemini-api'
  - '--image=gcr.io/$PROJECT_ID/gemini-api:$SHORT_SHA'
  - '--region=us-central1'
# prompts/v1.0.0/summary.txt
You are a summarization assistant...

# prompts/v1.1.0/summary.txt
You are an expert summarization assistant...

# Code loads the versioned prompt
def load_prompt(version="latest"):
    if version == "latest":
        version = get_latest_version()
    path = f"prompts/v{version}/summary.txt"
    with open(path) as f:
        return f.read()
# Git tags for releases
git tag -a prompt-v1.1.0 -m "Improved summary prompts"
git push origin prompt-v1.1.0
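`get_latest_version()` is not shown above; a minimal sketch, assuming versions live in local `prompts/vMAJOR.MINOR.PATCH/` directories as in the layout above:

from pathlib import Path

def get_latest_version(prompt_dir: str = "prompts") -> str:
    # List vX.Y.Z directories and return the highest semantic version
    versions = [
        p.name.lstrip("v") for p in Path(prompt_dir).iterdir()
        if p.is_dir() and p.name.startswith("v")
    ]
    return max(versions, key=lambda v: tuple(int(x) for x in v.split(".")))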
# tests/eval_prompts.py
import vertexai
from vertexai.generative_models import GenerativeModel

def test_summary_quality():
    model = GenerativeModel("gemini-2.0-flash-exp")
    test_cases = [
        {"input": "Long text...", "expected_length": 100},
        {"input": "Another text...", "expected_length": 150}
    ]
    for case in test_cases:
        response = model.generate_content(
            f"Summarize in {case['expected_length']} words: {case['input']}"
        )
        word_count = len(response.text.split())
        # Gate: fail if the deviation exceeds 20%
        assert abs(word_count - case['expected_length']) < case['expected_length'] * 0.2
# CI/CD integration
# If the tests fail → deployment is blocked
from kfp import dsl
from kfp.dsl import component

@component(base_image="python:3.11")
def evaluate_model(model_name: str) -> float:
    # Automated evaluation
    import vertexai
    from vertexai.generative_models import GenerativeModel
    model = GenerativeModel(model_name)
    # Run the eval suite...
    accuracy_score = 0.0  # placeholder for the real eval result
    return accuracy_score

@dsl.pipeline(name="gemini-eval-pipeline")
def eval_pipeline():
    eval_task = evaluate_model(model_name="gemini-2.0-flash-exp")
    # If accuracy < 0.8 → alert
    with dsl.Condition(eval_task.output < 0.8):
        slack_alert_task()  # separate alerting component (not shown)
# Trigger the pipeline every night (see the submission sketch below)
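A minimal sketch of compiling and submitting this pipeline to Vertex AI Pipelines; the staging bucket name is an assumption, and the nightly trigger itself can be wired up with Cloud Scheduler.

from kfp import compiler
from google.cloud import aiplatform

# Compile the pipeline definition above into a job spec
compiler.Compiler().compile(pipeline_func=eval_pipeline, package_path="eval_pipeline.json")

aiplatform.init(project="my-gemini-prod", location="us-central1")
job = aiplatform.PipelineJob(
    display_name="gemini-eval-nightly",
    template_path="eval_pipeline.json",
    pipeline_root="gs://my-bucket/pipeline-root",  # assumed staging bucket
)
job.submit()  # call this nightly, e.g. from Cloud Scheduler + a small Cloud Function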
| Model          | Input      | Output   | Cache     | Thinking |
|----------------|------------|----------|-----------|----------|
| Pro (exp)      | $1.25/1M   | $5.00/1M | $0.31/1M  | $5.00/1M |
| Flash (exp)    | $0.075/1M  | $0.30/1M | $0.019/1M | $0.30/1M |
| Flash-8B       | $0.0375/1M | $0.15/1M | $0.009/1M | N/A      |
| Flash-Lite     | FREE*      | FREE*    | N/A       | N/A      |
| Flash-Thinking | $0.075/1M  | $0.30/1M | $0.019/1M | $0.30/1M |
| Pro-Thinking   | $1.25/1M   | $5.00/1M | $0.31/1M  | $5.00/1M |
| 1.5 Pro        | $1.25/1M   | $5.00/1M | $0.31/1M  | N/A      |
| 1.5 Flash      | $0.075/1M  | $0.30/1M | $0.019/1M | N/A      |

*Flash-Lite: free up to 15 RPM (free tier), then $0.02/$0.08
def calculate_monthly_cost(
    requests_per_day: int,
    avg_input_tokens: int,
    avg_output_tokens: int,
    model: str = "flash"
):
    # Prices in $ per 1M tokens
    pricing = {
        "flash": {"input": 0.075, "output": 0.30},
        "pro": {"input": 1.25, "output": 5.00},
        "flash-8b": {"input": 0.0375, "output": 0.15}
    }
    monthly_requests = requests_per_day * 30
    input_cost = (monthly_requests * avg_input_tokens / 1_000_000) * pricing[model]["input"]
    output_cost = (monthly_requests * avg_output_tokens / 1_000_000) * pricing[model]["output"]
    return input_cost + output_cost

# Example: 10K requests/day, 1K input tokens, 500 output tokens
cost = calculate_monthly_cost(10_000, 1_000, 500, "flash")
# Flash: $22.50 input + $45.00 output = $67.50/month
def route_request(prompt: str, complexity: str):
    # Business rules
    if complexity == "simple" or len(prompt) < 100:
        return "gemini-2.0-flash-lite"   # FREE
    elif complexity == "medium":
        return "gemini-2.0-flash-exp"    # $0.075/1M
    else:
        return "gemini-2.5-pro-exp"      # $1.25/1M

# Automatic classifier
def classify_complexity(prompt: str) -> str:
    classifier = GenerativeModel("gemini-2.0-flash-8b")
    response = classifier.generate_content(
        f"Classify the complexity (simple/medium/complex): {prompt[:200]}"
    )
    return response.text.strip().lower()
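Putting the two functions together, a minimal end-to-end sketch (the sample prompt is illustrative; `GenerativeModel` comes from the earlier import):

user_prompt = "Explain the differences between our Q3 and Q4 sales trends"
complexity = classify_complexity(user_prompt)         # e.g. "medium"
model_name = route_request(user_prompt, complexity)   # e.g. "gemini-2.0-flash-exp"
response = GenerativeModel(model_name).generate_content(user_prompt)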
from vertexai.preview import caching

# Create a cache (TTL 1h)
cached_content = caching.CachedContent.create(
    model_name="gemini-2.0-flash-exp",
    contents=[large_document],  # 500K tokens
    ttl="3600s"
)

# Use the cache (cost: $0.019/1M vs $0.075/1M)
model = GenerativeModel.from_cached_content(cached_content)
response = model.generate_content("Summarize the document")

# ROI: the cache pays for itself after 4 requests
# (0.019 + 3×0.019 = 0.076) < (4×0.075 = 0.30)
# JSONL batch (asynchronous processing, -50% cost)
batch_requests = [
    {"custom_id": "req1", "contents": "Summarize A"},
    {"custom_id": "req2", "contents": "Summarize B"},
    # ... 1000 requests
]

# Submit the batch (client: OpenAI-compatible client, see the OpenAI SDK section below)
job = client.batches.create(
    input_file_id=upload_jsonl(batch_requests),
    endpoint="/v1/models/gemini-2.0-flash-exp:generateContent",
    completion_window="24h"
)

# Cost: $0.0375/1M input (vs $0.075 online)
# Latency: 5 min - 24h (acceptable for analytics)
# Combined example
generation_config = {
    "max_output_tokens": 150,            # Technique 7
    "temperature": 0.3,                  # Reduces variability
    "response_mime_type": "text/plain"   # No JSON overhead
}
# Short TTL (1h) for volatile data
cache_1h = caching.CachedContent.create(
    model_name="gemini-2.0-flash-exp",
    contents=[news_articles],
    ttl="3600s"  # 1h
)

# Long TTL (24h) for static reference data
cache_24h = caching.CachedContent.create(
    model_name="gemini-2.0-flash-exp",
    contents=[product_catalog],
    ttl="86400s"  # 24h
)

# ROI formula: break_even = cache_cost / (online_cost - cache_cost)
# Flash, 100K tokens: 0.0019 / (0.0075 - 0.0019) = 0.34 requests
# → profitable from the very first request
# Pattern 1: warm on deployment
def warm_cache_on_startup():
    frequently_used = [
        load_document("company_policies.txt"),
        load_document("product_specs.txt")
    ]
    for doc in frequently_used:
        caching.CachedContent.create(
            model_name="gemini-2.0-flash-exp",
            contents=[doc],
            ttl="86400s",
            display_name=f"cache-{doc.name}"
        )

# Pattern 2: refresh before expiry
import datetime

def refresh_cache_cron():
    now = datetime.datetime.now(datetime.timezone.utc)
    for cache in caching.CachedContent.list():
        if (cache.expire_time - now).total_seconds() < 3600:  # less than 1h left
            cache.update(ttl="86400s")  # reset the TTL
def cache_roi(
    tokens: int,
    requests_per_hour: int,
    ttl_hours: int,
    model: str = "flash"
):
    # Prices in $ per 1M tokens
    pricing = {
        "flash": {"online": 0.075, "cache": 0.019},
        "pro": {"online": 1.25, "cache": 0.31}
    }
    # Cost without cache
    cost_no_cache = (tokens / 1_000_000) * pricing[model]["online"] * requests_per_hour * ttl_hours
    # Cost with cache
    cache_creation = (tokens / 1_000_000) * pricing[model]["cache"]
    cache_reads = (tokens / 1_000_000) * pricing[model]["cache"] * requests_per_hour * ttl_hours
    cost_with_cache = cache_creation + cache_reads
    savings = cost_no_cache - cost_with_cache
    roi_percent = (savings / cost_no_cache) * 100
    return {"savings": savings, "roi_percent": roi_percent}

# Flash, 500K tokens, 10 req/h, 24h
print(cache_roi(500_000, 10, 24))
# {'savings': 6.71, 'roi_percent': 74.6}
import json
from vertexai.generative_models import GenerativeModel

def classify_request(prompt: str) -> dict:
    """Classify request complexity using Flash-8B"""
    classifier = GenerativeModel("gemini-2.0-flash-8b")
    classification_prompt = f"""
    Analyze this request and return JSON:
    {{
      "complexity": "simple|medium|complex",
      "domain": "general|technical|creative",
      "estimated_tokens": 100
    }}
    Request: {prompt[:500]}
    """
    response = classifier.generate_content(
        classification_prompt,
        generation_config={"response_mime_type": "application/json"}
    )
    return json.loads(response.text)
def route_to_model(prompt: str, classification: dict) -> str:
    """3-tier routing: Flash-Lite → Flash → Pro"""
    # Tier 1: Flash-Lite (FREE)
    if (classification["complexity"] == "simple" and
            classification["estimated_tokens"] < 200 and
            classification["domain"] == "general"):
        return "gemini-2.0-flash-lite-exp"
    # Tier 2: Flash ($0.075/1M)
    elif (classification["complexity"] in ["simple", "medium"] or
            classification["domain"] == "general"):
        return "gemini-2.0-flash-exp"
    # Tier 3: Pro ($1.25/1M)
    else:
        return "gemini-2.5-pro-exp"

# Average savings: 70% vs sending everything to Pro (see the worked example below)
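How a saving of this order can arise, as a rough worked example; the traffic mix below is illustrative, not measured, and uses the list prices from the table above.

# Illustrative mix: 50% Flash-Lite (free), 40% Flash, 10% Pro
# Assume 1M requests, each with 1K input + 500 output tokens
def blended_cost(mix: dict) -> float:
    cost_per_request = {
        "lite": 0.0,                                   # free tier
        "flash": (1000 * 0.075 + 500 * 0.30) / 1e6,    # ≈ $0.000225
        "pro":   (1000 * 1.25 + 500 * 5.00) / 1e6,     # ≈ $0.00375
    }
    return sum(1_000_000 * share * cost_per_request[tier] for tier, share in mix.items())

routed = blended_cost({"lite": 0.5, "flash": 0.4, "pro": 0.1})   # ≈ $465
all_pro = blended_cost({"lite": 0.0, "flash": 0.0, "pro": 1.0})  # ≈ $3750
print(1 - routed / all_pro)  # ≈ 0.88 with this mix; more Pro-heavy traffic lands nearer 70%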
async def generate_with_fallback(prompt: str):
    """Try Flash-Lite → Flash → Pro with fallbacks"""
    models = [
        ("gemini-2.0-flash-lite-exp", 0),
        ("gemini-2.0-flash-exp", 1),
        ("gemini-2.5-pro-exp", 2)
    ]
    for model_name, attempt in models:
        try:
            model = GenerativeModel(model_name)
            response = await model.generate_content_async(prompt)
            # Quality check
            if is_response_adequate(response):
                log_routing_decision(model_name, attempt, "success")
                return response
        except Exception as e:
            log_routing_decision(model_name, attempt, f"failed: {e}")
            continue
    raise Exception("All models failed")
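`is_response_adequate` and `log_routing_decision` are left abstract above; a minimal sketch of what they might look like (the length threshold and refusal check are assumptions, not part of the original):

import logging

def is_response_adequate(response, min_chars: int = 50) -> bool:
    # Reject empty, truncated-looking, or refusal-style answers
    text = (response.text or "").strip()
    return len(text) >= min_chars and not text.lower().startswith("i cannot")

def log_routing_decision(model_name: str, attempt: int, outcome: str) -> None:
    logging.info("routing model=%s attempt=%d outcome=%s", model_name, attempt, outcome)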
# BigQuery schema for routing analytics
CREATE TABLE routing_logs (
  timestamp TIMESTAMP,
  request_id STRING,
  prompt_hash STRING,
  classification JSON,
  model_used STRING,
  tokens_input INT64,
  tokens_output INT64,
  cost_usd FLOAT64,
  latency_ms INT64,
  quality_score FLOAT64
)

# Query: cost per tier
SELECT
  model_used,
  COUNT(*) AS requests,
  SUM(cost_usd) AS total_cost,
  AVG(quality_score) AS avg_quality
FROM routing_logs
WHERE DATE(timestamp) = CURRENT_DATE()
GROUP BY model_used
ORDER BY total_cost DESC
# 1. Create the JSONL file
import jsonlines

batch_requests = []
for user_id, data in dataset.items():
    batch_requests.append({
        "custom_id": f"user-{user_id}",
        "method": "POST",
        "url": "/v1/models/gemini-2.0-flash-exp:generateContent",
        "body": {
            "contents": [{
                "parts": [{"text": f"Analyze: {data}"}]
            }]
        }
    })

with jsonlines.open("batch_input.jsonl", "w") as f:
    f.write_all(batch_requests)

# 2. Upload to GCS
gsutil cp batch_input.jsonl gs://my-bucket/batches/
import time
from google.cloud import aiplatform

# Submit the batch job
batch_job = aiplatform.BatchPredictionJob.create(
    job_display_name="daily-summaries",
    model_name="gemini-2.0-flash-exp",
    gcs_source="gs://my-bucket/batches/batch_input.jsonl",
    gcs_destination_prefix="gs://my-bucket/batches/output/",
    instances_format="jsonl",
    predictions_format="jsonl"
)

# Monitor status
while batch_job.state.name != "JOB_STATE_SUCCEEDED":
    print(f"Status: {batch_job.state.name}")
    time.sleep(60)

# Download the results
gsutil cp gs://my-bucket/batches/output/* ./results/
from openai import OpenAI

# Vertex AI is compatible with the OpenAI SDK
client = OpenAI(
    base_url=f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT}/locations/{REGION}/endpoints/openapi",
    api_key=get_access_token()
)

# Create the batch
batch = client.batches.create(
    input_file_id=file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

# Check status
batch = client.batches.retrieve(batch.id)
print(f"Status: {batch.status}, Output: {batch.output_file_id}")

# Download the results
results = client.files.content(batch.output_file_id)
from google.cloud import bigquery

def get_daily_cost(project_id: str, date: str):
    client = bigquery.Client(project=project_id)
    # Cost per service from the billing export table
    query = f"""
    SELECT
      service.description,
      SUM(cost) AS total_cost
    FROM `{project_id}.billing_export.gcp_billing_export_v1_*`
    WHERE _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE '{date}')
      AND service.description LIKE '%Vertex AI%'
    GROUP BY service.description
    """
    # Example result: Vertex AI Prediction = $123.45
    return list(client.query(query).result())
# Terraform: create a budget alert
resource "google_billing_budget" "gemini_budget" {
  billing_account = var.billing_account
  display_name    = "Gemini Monthly Budget"

  budget_filter {
    projects = ["projects/${var.project_id}"]
    services = ["services/aiplatform.googleapis.com"]
  }

  amount {
    specified_amount {
      units = "1000"  # $1000/month
    }
  }

  threshold_rules {
    threshold_percent = 0.5  # Alert at 50%
  }
  threshold_rules {
    threshold_percent = 0.9  # Alert at 90%
  }
  threshold_rules {
    threshold_percent = 1.0  # Alert at 100%
    spend_basis       = "FORECASTED_SPEND"
  }
}
# BigQuery view for the dashboard
CREATE VIEW cost_analysis AS
SELECT
  TIMESTAMP_TRUNC(usage_start_time, DAY) AS date,
  project.id AS project_id,
  service.description AS service,
  sku.description AS sku,
  SUM(usage.amount) AS usage_amount,
  usage.unit AS usage_unit,
  SUM(cost) AS cost_usd,
  SUM(cost) / NULLIF(SUM(usage.amount), 0) AS cost_per_unit
FROM `billing_export.gcp_billing_export_v1_*`
WHERE service.description = 'Vertex AI'
GROUP BY 1, 2, 3, 4, 6

# Looker metrics:
# - Daily cost per model
# - Tokens/$ ratio
# - Top 10 endpoints by cost
# Attach labels to each request
from vertexai.generative_models import GenerativeModel

model = GenerativeModel(
    "gemini-2.0-flash-exp",
    # Labels for cost attribution
    labels={
        "team": "data-science",
        "project": "customer-insights",
        "env": "production"
    }
)

# Query cost per label
SELECT
  labels.value AS team,
  SUM(cost) AS team_cost
FROM billing_export
WHERE labels.key = 'team'
GROUP BY team
ORDER BY team_cost DESC
# Result: data-science = $450, engineering = $200
# config/models.yaml
models:
  production:
    primary: "gemini-2.0-flash-001"   # Stable
    fallback: "gemini-1.5-flash-002"
  staging:
    primary: "gemini-2.0-flash-exp"   # Testing latest

# Code that loads the config
import os
import yaml

def get_model(env: str = "production"):
    with open("config/models.yaml") as f:
        config = yaml.safe_load(f)
    return config["models"][env]["primary"]

# Allows a quick swap if an issue arises (see the fallback sketch below)
model_name = get_model(os.getenv("ENV", "production"))
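A minimal sketch of using the `fallback` entry from the same config when the primary model errors out; the coarse `except Exception` is a simplification.

from vertexai.generative_models import GenerativeModel

def generate_with_config_fallback(prompt: str, env: str = "production"):
    with open("config/models.yaml") as f:
        cfg = yaml.safe_load(f)["models"][env]
    try:
        return GenerativeModel(cfg["primary"]).generate_content(prompt)
    except Exception:
        # Primary model unavailable or misbehaving → swap to the pinned fallback
        return GenerativeModel(cfg["fallback"]).generate_content(prompt)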
# Example: migration 2.0 → 2.5
# 1. Announcement: Jan 2026
# 2. Sunset: Jul 2026

# Script to detect usage of deprecated models
from google.cloud import logging_v2

def find_deprecated_usage():
    client = logging_v2.Client()
    filter_str = '''
    resource.type="aiplatform.googleapis.com/Endpoint"
    jsonPayload.model_name:"gemini-2.0"
    '''
    entries = client.list_entries(filter_=filter_str, max_results=100)
    deprecated_calls = {}
    for entry in entries:
        model = entry.payload.get("model_name")
        deprecated_calls[model] = deprecated_calls.get(model, 0) + 1
    return deprecated_calls

# Send a Slack alert if usage is detected (sketch below)
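A minimal sketch of the Slack alert, assuming a Slack incoming-webhook URL stored in an environment variable (`SLACK_WEBHOOK_URL` is a placeholder, not from the original):

import os
import requests

def alert_slack_if_deprecated():
    calls = find_deprecated_usage()
    if not calls:
        return
    text = "Deprecated Gemini models still in use: " + ", ".join(
        f"{model} ({count} calls)" for model, count in calls.items()
    )
    # Post to the Slack incoming webhook (assumed env var)
    requests.post(os.environ["SLACK_WEBHOOK_URL"], json={"text": text}, timeout=10)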
# docs/adr/0003-model-selection.md

# ADR 0003: Gemini 2.0 Flash for Production

## Status
Accepted (2026-02-01)

## Context
Need to generate summaries for 1M requests/day within a $500/month budget

## Decision
Use Gemini 2.0 Flash-exp with the stable 1.5 Flash as fallback

## Consequences
- Positive: cost $300/month (-40% vs Pro)
- Positive: latency <2s (vs 5s for Pro)
- Negative: quality -5% vs Pro (acceptable for the business)
- Risk: -exp can change → monitor quality

## Alternatives Considered
- Pro 2.5: too expensive ($1800/month)
- Flash-8B: insufficient quality (-15%)
# permissions.yaml
tools:
  read_database:
    allowed_agents: ["customer-support", "analytics"]
    requires_approval: false
  send_email:
    allowed_agents: ["customer-support"]
    requires_approval: true   # Human-in-the-loop
    max_daily_calls: 100
  delete_user:
    allowed_agents: []        # No agent allowed
    requires_approval: true
    approvers: ["admin@company.com"]

# Enforcement in the agent runtime (see the is_tool_allowed sketch below)
def execute_tool(agent_id, tool_name, params):
    if not is_tool_allowed(agent_id, tool_name):
        raise PermissionError(f"{agent_id} not allowed to use {tool_name}")
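`is_tool_allowed` is not defined above; a minimal sketch that reads the same permissions.yaml:

import yaml

def is_tool_allowed(agent_id: str, tool_name: str, path: str = "permissions.yaml") -> bool:
    with open(path) as f:
        tools = yaml.safe_load(f)["tools"]
    tool = tools.get(tool_name)
    # Unknown tools are denied by default
    return tool is not None and agent_id in tool.get("allowed_agents", [])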
# BigQuery schema for agent audit logs
CREATE TABLE agent_audit_logs (
  timestamp TIMESTAMP,
  agent_id STRING,
  session_id STRING,
  user_id STRING,
  tool_called STRING,
  tool_params JSON,
  tool_result JSON,
  human_approval BOOLEAN,
  approver_email STRING,
  cost_usd FLOAT64
)

# Query: most-called tools
SELECT
  tool_called,
  COUNT(*) AS call_count,
  SUM(cost_usd) AS total_cost,
  AVG(IF(tool_result.status = 'success', 1, 0)) AS success_rate
FROM agent_audit_logs
WHERE DATE(timestamp) >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
GROUP BY tool_called
ORDER BY call_count DESC
LIMIT 10
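A minimal sketch of how the agent runtime could append a row to this table via the BigQuery streaming insert API; the `my-gemini-prod.agents` dataset name is an assumption.

from google.cloud import bigquery
import datetime, json

def log_agent_call(agent_id, session_id, user_id, tool, params, result, cost_usd):
    client = bigquery.Client()
    row = {
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "agent_id": agent_id,
        "session_id": session_id,
        "user_id": user_id,
        "tool_called": tool,
        "tool_params": json.dumps(params),
        "tool_result": json.dumps(result),
        "human_approval": False,
        "approver_email": None,
        "cost_usd": cost_usd,
    }
    # insert_rows_json returns a list of errors (empty on success)
    errors = client.insert_rows_json("my-gemini-prod.agents.agent_audit_logs", [row])
    assert not errors, errors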
# marketplace/customer-support-agent.yaml
name: "Customer Support Agent v2.1"
description: "Handle customer inquiries"
tools: [read_faq, search_orders, send_email]
avg_cost_per_session: "$0.15"
sla_response_time: "< 5s"
rating: 4.7/5
// Android Kotlin
import com.google.ai.edge.aicore.GenerativeModel

class ChatActivity : AppCompatActivity() {
    private lateinit var model: GenerativeModel

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        // Load Gemma Nano (stored locally on the device)
        model = GenerativeModel(
            modelName = "gemma-nano-3b",
            apiKey = null  // No API key needed, everything runs locally
        )
    }

    suspend fun generateResponse(prompt: String): String {
        // Latency: 50-100ms, no API cost, works offline
        val response = model.generateContent(prompt)
        return response.text
    }
}
from google.cloud import aiplatform

# 1. Prepare the dataset (JSONL)
training_data = [
    {"input": "Customer question...", "output": "Answer..."},
    # ... 1000+ examples
]

# 2. Upload to GCS
upload_to_gcs(training_data, "gs://my-bucket/gemma-training.jsonl")

# 3. Launch the fine-tuning job
tuning_job = aiplatform.PipelineJob(
    display_name="gemma-3-9b-customer-support",
    template_path="gemma-3-9b-tuning-template",
    parameter_values={
        "model_name": "gemma-3-9b",
        "training_data": "gs://my-bucket/gemma-training.jsonl",
        "epochs": 3,
        "learning_rate": 0.0001
    }
)

# Cost: ~$50 for 1000 examples, 3 epochs
# Analyze feature activations
from gemma_scope import GemmaScope

scope = GemmaScope.load("gemma-3-9b")
activations = scope.analyze("The customer is very satisfied")
# Feature #42531: 0.89 (high activation)
# Gemini in Gmail, Docs, Sheets, Slides

# Gmail: "Help me write"
- Prompt: "Draft email declining meeting politely"
- Output: complete generated email

# Docs: "Help me organize"
- Select raw text → "Create outline"
- Output: hierarchical structure

# Sheets: "Help me analyze"
- Select data → "Create pivot table"
- Output: pivot table + insights

# Pricing: $30/user/month (Workspace Business)
# Vision: the LLM generates interactive UI directly

# Prompt: "Create a Q4 sales dashboard with region filters"
# Output: not markdown, but React components

# Emerging frameworks:
# - Vercel AI SDK (generateUI)
# - Anthropic Artifacts (Claude)
# - Google: in development (Gemini + Material Design)
# Pre-production checklist

## Infrastructure
☐ Cloud Run/GKE configured with auto-scaling
☐ Service Account with least privilege
☐ VPC-SC enabled if data is sensitive
☐ CMEK configured if compliance requires it

## FinOps
☐ Budget alerts configured (50%, 90%, 100%)
☐ Context caching enabled (optimized TTL)
☐ Model routing implemented
☐ Cost attribution with labels

## Monitoring
☐ Cloud Logging captures all requests
☐ Real-time cost dashboard
☐ Model-quality alerts (accuracy < threshold)

## Compliance
☐ DPA signed with Google
☐ DPIA completed if GDPR applies
☐ Audit logs enabled (90-day retention)