Tabular Classification
Scikit-learn
Joblib
postgresql
sql
query-cache
plan-cache
redis
database
tabular-regression
Instructions to use nilenpatel/pg-plan-cache-models with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use nilenpatel/pg-plan-cache-models with Scikit-learn:
from huggingface_hub import hf_hub_download import joblib model = joblib.load( hf_hub_download("nilenpatel/pg-plan-cache-models", "sklearn_model.joblib") ) # only load pickle files from sources you trust # read more about it here https://skops.readthedocs.io/en/stable/persistence.html - Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Train all three pg_plan_cache models: | |
| 1. SQL Cache Advisor (classification: low / medium / high) | |
| 2. Cache TTL Recommender (regression: seconds) | |
| 3. Query Complexity Estimator (regression: 1-100 score) | |
| Saves trained models as joblib files in the ./trained/ directory. | |
| """ | |
| import os | |
| import json | |
| import numpy as np | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.metrics import classification_report, mean_absolute_error, r2_score | |
| from sklearn.preprocessing import LabelEncoder | |
| import joblib | |
| from features import extract_features, FEATURE_NAMES | |
| from dataset import generate_dataset | |
| OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained") | |
| def train(): | |
| print("=" * 60) | |
| print(" pg_plan_cache β Model Training") | |
| print("=" * 60) | |
| # ββ Generate data βββββββββββββββββββββββββββββββββββββββββ | |
| print("\n[1/5] Generating synthetic training data...") | |
| queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42) | |
| print(f" Generated {len(queries)} samples") | |
| # ββ Extract features ββββββββββββββββββββββββββββββββββββββ | |
| print("\n[2/5] Extracting features...") | |
| X = np.array([extract_features(q) for q in queries]) | |
| print(f" Feature matrix: {X.shape}") | |
| # ββ Encode labels βββββββββββββββββββββββββββββββββββββββββ | |
| le = LabelEncoder() | |
| y_benefit = le.fit_transform(benefits) # low=1, medium=2, high=0 | |
| y_ttl = np.array(ttls, dtype=float) | |
| y_complexity = np.array(complexities, dtype=float) | |
| # ββ Split βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \ | |
| train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42) | |
| print(f" Train: {len(X_train)}, Test: {len(X_test)}") | |
| # ββ Model 1: Cache Advisor (classification) βββββββββββββββ | |
| print("\n[3/5] Training SQL Cache Advisor...") | |
| clf = RandomForestClassifier( | |
| n_estimators=200, | |
| max_depth=15, | |
| min_samples_split=5, | |
| min_samples_leaf=2, | |
| random_state=42, | |
| n_jobs=-1, | |
| ) | |
| clf.fit(X_train, yb_train) | |
| yb_pred = clf.predict(X_test) | |
| print("\n Classification Report:") | |
| report = classification_report(yb_test, yb_pred, target_names=le.classes_) | |
| print(" " + report.replace("\n", "\n ")) | |
| cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy") | |
| print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})") | |
| # ββ Model 2: TTL Recommender (regression) βββββββββββββββββ | |
| print("\n[4/5] Training Cache TTL Recommender...") | |
| reg_ttl = GradientBoostingRegressor( | |
| n_estimators=200, | |
| max_depth=8, | |
| learning_rate=0.1, | |
| min_samples_split=5, | |
| random_state=42, | |
| ) | |
| reg_ttl.fit(X_train, yt_train) | |
| yt_pred = reg_ttl.predict(X_test) | |
| mae_ttl = mean_absolute_error(yt_test, yt_pred) | |
| r2_ttl = r2_score(yt_test, yt_pred) | |
| print(f" MAE: {mae_ttl:.1f} seconds") | |
| print(f" R2: {r2_ttl:.3f}") | |
| # ββ Model 3: Complexity Estimator (regression) ββββββββββββ | |
| print("\n[5/5] Training Query Complexity Estimator...") | |
| reg_cplx = GradientBoostingRegressor( | |
| n_estimators=200, | |
| max_depth=8, | |
| learning_rate=0.1, | |
| min_samples_split=5, | |
| random_state=42, | |
| ) | |
| reg_cplx.fit(X_train, yc_train) | |
| yc_pred = reg_cplx.predict(X_test) | |
| mae_cplx = mean_absolute_error(yc_test, yc_pred) | |
| r2_cplx = r2_score(yc_test, yc_pred) | |
| print(f" MAE: {mae_cplx:.1f} points") | |
| print(f" R2: {r2_cplx:.3f}") | |
| # ββ Save models βββββββββββββββββββββββββββββββββββββββββββ | |
| os.makedirs(OUTPUT_DIR, exist_ok=True) | |
| joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib")) | |
| joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib")) | |
| joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib")) | |
| joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib")) | |
| # Feature importances | |
| importances = { | |
| "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())), | |
| "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())), | |
| "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())), | |
| } | |
| with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f: | |
| json.dump(importances, f, indent=2) | |
| # Model metadata | |
| metadata = { | |
| "models": { | |
| "cache_advisor": { | |
| "type": "RandomForestClassifier", | |
| "task": "classification", | |
| "classes": le.classes_.tolist(), | |
| "accuracy_cv5": round(float(cv_scores.mean()), 4), | |
| }, | |
| "ttl_recommender": { | |
| "type": "GradientBoostingRegressor", | |
| "task": "regression", | |
| "unit": "seconds", | |
| "mae": round(float(mae_ttl), 2), | |
| "r2": round(float(r2_ttl), 4), | |
| }, | |
| "complexity_estimator": { | |
| "type": "GradientBoostingRegressor", | |
| "task": "regression", | |
| "unit": "score (1-100)", | |
| "mae": round(float(mae_cplx), 2), | |
| "r2": round(float(r2_cplx), 4), | |
| }, | |
| }, | |
| "features": FEATURE_NAMES, | |
| "n_features": len(FEATURE_NAMES), | |
| "training_samples": len(queries), | |
| "test_samples": len(X_test), | |
| } | |
| with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f: | |
| json.dump(metadata, f, indent=2) | |
| print(f"\n Models saved to {OUTPUT_DIR}/") | |
| print(" Files: cache_advisor.joblib, ttl_recommender.joblib,") | |
| print(" complexity_estimator.joblib, label_encoder.joblib,") | |
| print(" feature_importances.json, metadata.json") | |
| print("\nDone.") | |
| if __name__ == "__main__": | |
| train() | |