scikit-learn provides a unified API for 50+ algorithms. Learn the classic pipeline and train your first model in 10 lines.
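As a taste, here is a minimal sketch of that ten-line pipeline, assuming a pandas DataFrame df whose numeric features predict the target column 'prix':

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

X, y = df.drop(columns=['prix']), df['prix']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
print(r2_score(y_test, model.predict(X_test)))  # held-out R²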
from sklearn.model_selection import train_test_split

X = df.drop(columns=['prix'])
y = df['prix']

# First split: hold out 30% for validation + test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
# Second split: divide that 30% equally into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.50, random_state=42
)
# Result: 70% train, 15% val, 15% test
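One variant worth knowing: for a classification target, the stratify argument keeps class proportions identical across splits, e.g.:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)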
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder: for target labels (low/medium/high)
le = LabelEncoder()
df['niveau_enc'] = le.fit_transform(df['niveau'])

# One-hot encoding: for non-ordinal features (region, product);
# pd.get_dummies is the quick pandas route, OneHotEncoder the sklearn one
df_encoded = pd.get_dummies(df, columns=['region', 'produit'], drop_first=True)
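Unlike pd.get_dummies, the imported OneHotEncoder can handle categories unseen at training time. A sketch, assuming the train/test frames from the split above still contain the raw categorical columns:

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[['region', 'produit']])
X_test_cat = ohe.transform(X_test[['region', 'produit']])  # unseen categories become all-zero rows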
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()  # mean 0, standard deviation 1
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # NO fit_transform on test!
fit_transform on train, and only transform on test/val. Otherwise the scaler learns statistics from the test set and you cause data leakage.
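The safest way to respect this rule, especially under cross-validation, is to chain the scaler and the model in a Pipeline so scaling is refit inside each training fold. A minimal sketch:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)           # scaler fit on train only
pred_pipe = pipe.predict(X_test)     # reuses the train statistics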
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Linear regression (baseline)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# XGBoost (often the winner in competitions)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.05)
xgb.fit(X_train, y_train)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

logreg = LogisticRegression(max_iter=1000)   # linear baseline
knn = KNeighborsClassifier(n_neighbors=5)    # distance-based: scale features first
tree = DecisionTreeClassifier(max_depth=5)   # interpretable; cap max_depth to limit overfitting
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)
print(f'RMSE: {rmse:.0f} | R²: {r2:.3f}')
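The classification metrics below expect pred and proba from one of the classifiers defined earlier. A minimal sketch producing them (the choice of logreg is arbitrary, and a binary target is assumed):

logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)               # hard class labels
proba = logreg.predict_proba(X_test)[:, 1]  # probability of the positive class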
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report)

print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
print(f'AUC-ROC: {roc_auc_score(y_test, proba):.3f}')  # proba: positive-class probabilities, defined above
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X, y, cv=5, scoring='r2')
print(f'Mean R²: {scores.mean():.3f} ± {scores.std():.3f}')
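To collect several metrics per fold in one pass, cross_validate from the same module is the natural extension; a short sketch:

from sklearn.model_selection import cross_validate

res = cross_validate(rf, X, y, cv=5,
                     scoring=['r2', 'neg_root_mean_squared_error'])
print(res['test_r2'].mean())
print(-res['test_neg_root_mean_squared_error'].mean())  # flip sign back to RMSE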
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20, None]
}
grid = GridSearchCV(RandomForestRegressor(), params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
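By default GridSearchCV refits the best combination on all of X_train and exposes it as best_estimator_; a short usage sketch:

best_rf = grid.best_estimator_
print(best_rf.score(X_test, y_test))  # R² on the held-out test set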
# Optuna (more modern, Bayesian search)
import optuna

def objective(trial):
    n = trial.suggest_int('n_estimators', 100, 1000)
    d = trial.suggest_int('max_depth', 3, 30)
    model = RandomForestRegressor(n_estimators=n, max_depth=d)
    return cross_val_score(model, X, y, cv=3).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
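The winning hyperparameters come back from study.best_params; a sketch of the final refit:

print(study.best_params)
final_model = RandomForestRegressor(**study.best_params, random_state=42)
final_model.fit(X_train, y_train)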
import matplotlib.pyplot as plt

importances = pd.Series(rf.feature_importances_, index=X.columns)
importances.sort_values().plot(kind='barh', figsize=(8, 6))
plt.title('Feature importance')
plt.show()
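Impurity-based importances tend to favour high-cardinality features; permutation_importance from sklearn.inspection is a common model-agnostic cross-check, sketched here on the test set:

from sklearn.inspection import permutation_importance

result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)
perm = pd.Series(result.importances_mean, index=X_test.columns)
print(perm.sort_values(ascending=False))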
import joblib

joblib.dump(rf, 'modele_loyer.joblib')

# Later, in production
model = joblib.load('modele_loyer.joblib')
# NB: this raw row (with the string 'Bastos') only works if the saved object
# is a full Pipeline that encodes categoricals itself -- see the sketch below
prediction = model.predict([[80, 3, 5, 'Bastos']])
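To make that last call genuinely work, the saved artifact should be a full Pipeline that owns the preprocessing. A minimal sketch, assuming hypothetical feature columns surface, pieces, etage, quartier matching the raw row above:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import joblib
import pandas as pd

pipe = Pipeline([
    ('prep', ColumnTransformer(
        [('quartier', OneHotEncoder(handle_unknown='ignore'), ['quartier'])],
        remainder='passthrough')),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])
pipe.fit(X_train, y_train)
joblib.dump(pipe, 'pipeline_loyer.joblib')

# In production, the pipeline handles the encoding itself
pipe = joblib.load('pipeline_loyer.joblib')
row = pd.DataFrame([[80, 3, 5, 'Bastos']],
                   columns=['surface', 'pieces', 'etage', 'quartier'])
print(pipe.predict(row))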