import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
GridSearchCV,
RandomizedSearchCV,
cross_val_score,
cross_validate,
train_test_split,
cross_val_predict
)
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
f1_score,
make_scorer,
precision_score,
recall_score,
plot_confusion_matrix,
precision_recall_curve,
average_precision_score,
roc_curve,
roc_auc_score
)
from pandas.plotting import scatter_matrix
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import eli5
import shap
plt.rcParams["font.size"] = 16
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
Let's start with the problem. Eigenanalysis of large systems demands significant computational resources, and reducing the computational time of solving a large eigenproblem has been the subject of numerous studies in many areas of science and engineering. Here, I employ different machine learning models to try to predict the eigenvalues of a given matrix.
The matrix I start with is a large, sparse 502x502 Jacobian arising from the solution of the Burgers problem. In this project, gathering data is one of the most difficult tasks (if not the most difficult) due to the variety of possible problems. I have gathered information for as many matrices as possible to emulate a true machine learning pipeline.
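For context, the target I will eventually predict is the real part of the right-most (least stable) eigenvalue of each Jacobian. Below is a minimal sketch of how such a value could be computed with SciPy; the random stand-in matrix is purely hypothetical and is not the actual Jacobian used in this project.
# Hypothetical sketch: right-most eigenvalue of a sparse matrix with SciPy.
import scipy.sparse as sp
import scipy.sparse.linalg as spla

jacobian = sp.random(502, 502, density = 0.01, format = 'csr', random_state = 0)  # stand-in matrix
# which = 'LR' asks for the eigenvalue with the largest real part (the right-most one).
eigenvalue = spla.eigs(jacobian, k = 1, which = 'LR', return_eigenvectors = False)[0]
target = eigenvalue.real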
To generate many matrices, the numerical mesh on which the Burgers equations are set up is changed in a certain way. For this purpose, vertex[15]
is moved around and the resulting Jacobian matrix is used for the eigenanalysis. The following figure shows the location of vertex[15]
in relation to the adjacent cells. This figure also depicts the coordinates of the neighboring vertices.
The next issue is that each entry in the data would contain at least 502x502 = 252004 matrix entries, which correspond to about 252k features. Given the number of examples in the data I have gathered, training an ML model on all of them would be infeasible. As a result, I have used a clever way to extract the features most important to the problem at hand. This selection will differ from problem to problem; for now, I won't go into the details of this method.
In this section, I will explain the data, the relevant features, and the target.
df = pd.read_csv('Burgers-test-1.csv')
df
vertex[15] x | vertex[15] y | jac[4][4] | jac[4][6] | jac[4][13] | jac[4][15] | jac[4][22] | jac[4][23] | jac[6][4] | jac[6][6] | ... | jac[22][15] | jac[22][22] | jac[22][23] | jac[23][4] | jac[23][6] | jac[23][13] | jac[23][15] | jac[23][22] | jac[23][23] | eigenvalue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000692 | 0.19104 | -0.83566 | 0.282060 | 0.79353 | 0.000000e+00 | -0.32935 | 0 | 0.91463 | -0.91617 | ... | -2.569600e-16 | -55.166 | 4.9628 | 0 | 0.000000e+00 | 21.89600 | -15.430 | 52.0310 | -69.34400 | -0.28547 |
1 | 0.000692 | 0.19164 | -0.83187 | 0.279730 | 0.79228 | 0.000000e+00 | -0.32853 | 0 | 0.91371 | -0.91521 | ... | -2.492500e-16 | -55.214 | 4.7694 | 0 | 0.000000e+00 | 21.77300 | -15.505 | 52.2790 | -69.06300 | -0.28658 |
2 | 0.000692 | 0.19711 | -0.79771 | 0.258710 | 0.78106 | 2.525900e-17 | -0.32113 | 0 | 0.90545 | -0.90654 | ... | 1.813700e-16 | -55.645 | 3.0297 | 0 | 0.000000e+00 | 20.66700 | -16.179 | 54.5080 | -66.53000 | -0.29703 |
3 | 0.000692 | 0.25182 | -0.45521 | 0.048438 | 0.66865 | -2.408400e-17 | -0.24698 | 0 | 0.82260 | -0.81961 | ... | 4.093400e-16 | -59.961 | -14.3680 | 0 | 0.000000e+00 | 9.60790 | -22.916 | 76.8040 | -41.20300 | -0.47191 |
4 | 0.000692 | 0.25243 | -0.45140 | 0.046101 | 0.66740 | 0.000000e+00 | -0.24615 | 0 | 0.82167 | -0.81864 | ... | 0.000000e+00 | -60.009 | -14.5610 | 0 | -5.683100e-16 | 9.48500 | -22.991 | 77.0520 | -40.92200 | -0.47545 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38835 | 0.114560 | 0.24997 | 131.76000 | -92.123000 | -56.70800 | 0.000000e+00 | 23.56700 | 0 | -132.91000 | 133.43000 | ... | 0.000000e+00 | -13.274 | 3.1578 | 0 | -1.503200e+01 | -0.15607 | 24.314 | 3.1713 | 0.30762 | 245.82000 |
38836 | 0.114560 | 0.24997 | 131.76000 | -92.128000 | -56.71000 | 0.000000e+00 | 23.56800 | 0 | -132.91000 | 133.43000 | ... | 0.000000e+00 | -13.275 | 3.1569 | 0 | -1.503300e+01 | -0.15608 | 24.314 | 3.1713 | 0.30711 | 245.82000 |
38837 | 0.114560 | 0.24998 | 131.77000 | -92.133000 | -56.71300 | -4.301600e-15 | 23.57000 | 0 | -132.91000 | 133.43000 | ... | 0.000000e+00 | -13.275 | 3.1560 | 0 | -1.503400e+01 | -0.15608 | 24.315 | 3.1713 | 0.30660 | 245.83000 |
38838 | 0.114560 | 0.24999 | 131.78000 | -92.137000 | -56.71500 | 0.000000e+00 | 23.57200 | 0 | -132.92000 | 133.44000 | ... | 0.000000e+00 | -13.276 | 3.1550 | 0 | -1.503500e+01 | -0.15609 | 24.315 | 3.1713 | 0.30608 | 245.84000 |
38839 | 0.114560 | 0.25000 | 131.79000 | -92.142000 | -56.71800 | 4.301800e-15 | 23.57300 | 0 | -132.92000 | 133.44000 | ... | 7.870000e-18 | -13.276 | 3.1541 | 0 | -1.503600e+01 | -0.15609 | 24.315 | 3.1713 | 0.30557 | 245.85000 |
38840 rows × 39 columns
df.dtypes
vertex[15] x    float64
vertex[15] y    float64
jac[4][4]       float64
jac[4][6]       float64
jac[4][13]      float64
jac[4][15]      float64
jac[4][22]      float64
jac[4][23]      int64
jac[6][4]       float64
jac[6][6]       float64
jac[6][13]      float64
jac[6][15]      float64
jac[6][22]      int64
jac[6][23]      float64
jac[13][4]      float64
jac[13][6]      float64
jac[13][13]     float64
jac[13][15]     int64
jac[13][22]     float64
jac[13][23]     float64
jac[15][4]      float64
jac[15][6]      float64
jac[15][13]     float64
jac[15][15]     float64
jac[15][22]     float64
jac[15][23]     float64
jac[22][4]      float64
jac[22][6]      int64
jac[22][13]     float64
jac[22][15]     float64
jac[22][22]     float64
jac[22][23]     float64
jac[23][4]      int64
jac[23][6]      float64
jac[23][13]     float64
jac[23][15]     float64
jac[23][22]     float64
jac[23][23]     float64
eigenvalue      float64
dtype: object
The data includes 38 features and a single target value. The features and target are all numeric. Here is a description of these values:

- vertex[15] x is the x coordinate of the vertex with id = 15 in the mesh.
- vertex[15] y is the y coordinate of the vertex with id = 15 in the mesh.
- jac[i][j] is the value in row i and column j of the Jacobian matrix.
- eigenvalue is the real part of the right-most eigenvalue of the Jacobian matrix. This is our target value.

There are no missing values.
df_description = df.describe(include = 'all')
df_description
vertex[15] x | vertex[15] y | jac[4][4] | jac[4][6] | jac[4][13] | jac[4][15] | jac[4][22] | jac[4][23] | jac[6][4] | jac[6][6] | ... | jac[22][15] | jac[22][22] | jac[22][23] | jac[23][4] | jac[23][6] | jac[23][13] | jac[23][15] | jac[23][22] | jac[23][23] | eigenvalue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 38840.000000 | 38840.000000 | 38840.000000 | 38840.000000 | 38840.00000 | 3.884000e+04 | 38840.000000 | 38840.0 | 38840.000000 | 38840.000000 | ... | 3.884000e+04 | 38840.000000 | 38840.000000 | 38840.0 | 3.884000e+04 | 38840.000000 | 38840.000000 | 38840.000000 | 38840.000000 | 38840.000000 |
mean | 0.064300 | 0.250411 | 39.575594 | -35.072945 | -4.20193 | 1.877330e-18 | 2.581592 | 0.0 | -30.191792 | 30.482211 | ... | -1.352026e-18 | -25.538514 | -5.739185 | 0.0 | -3.218755e+00 | -3.771587 | 3.177027 | 32.971334 | -16.375711 | 71.399543 |
std | 0.032873 | 0.032130 | 46.230474 | 30.400564 | 22.77777 | 9.799371e-16 | 9.623131 | 0.0 | 46.240072 | 46.394341 | ... | 2.257896e-16 | 13.090931 | 8.201346 | 0.0 | 4.513274e+00 | 5.792621 | 12.491575 | 22.215507 | 13.966263 | 80.653834 |
min | 0.000692 | 0.177440 | -26.001000 | -92.601000 | -56.96600 | -4.310300e-15 | -12.548000 | 0.0 | -133.100000 | -22.156000 | ... | -1.327400e-15 | -64.708000 | -33.504000 | 0.0 | -1.513100e+01 | -14.586000 | -30.325000 | 3.170700 | -69.344000 | -5.512400 |
25% | 0.035998 | 0.228470 | -2.094550 | -62.168000 | -20.94300 | 0.000000e+00 | -4.916000 | 0.0 | -66.173250 | -11.443250 | ... | 0.000000e+00 | -33.719000 | -10.558500 | 0.0 | -5.890350e+00 | -7.318025 | -5.517500 | 14.689500 | -23.231750 | -5.512400 |
50% | 0.069453 | 0.250160 | 28.664500 | -30.483500 | 3.76805 | 0.000000e+00 | -0.644690 | 0.0 | -15.848500 | 16.157500 | ... | 0.000000e+00 | -19.608000 | -3.296100 | 0.0 | -1.355200e-15 | -4.635550 | 2.221500 | 27.358500 | -11.200500 | 46.089500 |
75% | 0.092797 | 0.272340 | 79.616250 | -5.369400 | 14.02125 | 0.000000e+00 | 10.180000 | 0.0 | 11.587000 | 66.618000 | ... | 0.000000e+00 | -15.096500 | 0.984130 | 0.0 | 0.000000e+00 | -1.504650 | 13.530000 | 48.653250 | -6.554850 | 136.812500 |
max | 0.114560 | 0.322560 | 132.530000 | 6.370400 | 27.18000 | 8.513100e-15 | 23.737000 | 0.0 | 22.044000 | 133.630000 | ... | 1.351300e-15 | -13.225000 | 4.962800 | 0.0 | 8.868300e-02 | 21.896000 | 24.347000 | 101.330000 | 0.356300 | 246.610000 |
8 rows × 39 columns
Some features are always zero or very close to zero. It is safe to remove these features, as they will not affect the eventual ML model.
drop_list = []
for column in df.columns:
    diff = np.abs(df_description.loc['max', column] - df_description.loc['min', column])
    if diff < 1e-6:
        drop_list.append(column)
df.drop(columns = drop_list, inplace = True)
Here is the list of the dropped features.
drop_list
['jac[4][15]', 'jac[4][23]', 'jac[6][22]', 'jac[6][23]', 'jac[13][6]', 'jac[13][15]', 'jac[22][6]', 'jac[22][15]', 'jac[23][4]']
As a first experiment, I will focus only on whether the eigenvalue target is positive or negative. I will therefore turn this into a classification problem by making the target binary: I remove the eigenvalue column and add a target column.
df['target'] = (df['eigenvalue'] > 0).astype(int)
df.drop(columns = ['eigenvalue'], inplace = True)
df
vertex[15] x | vertex[15] y | jac[4][4] | jac[4][6] | jac[4][13] | jac[4][22] | jac[6][4] | jac[6][6] | jac[6][13] | jac[6][15] | ... | jac[22][4] | jac[22][13] | jac[22][22] | jac[22][23] | jac[23][6] | jac[23][13] | jac[23][15] | jac[23][22] | jac[23][23] | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000692 | 0.19104 | -0.83566 | 0.282060 | 0.79353 | -0.32935 | 0.91463 | -0.91617 | -0.31414 | 0.29069 | ... | -5.208200e+00 | 13.6720 | -55.166 | 4.9628 | 0.000000e+00 | 21.89600 | -15.430 | 52.0310 | -69.34400 | 0 |
1 | 0.000692 | 0.19164 | -0.83187 | 0.279730 | 0.79228 | -0.32853 | 0.91371 | -0.91521 | -0.31274 | 0.28992 | ... | -5.096900e+00 | 13.7410 | -55.214 | 4.7694 | 0.000000e+00 | 21.77300 | -15.505 | 52.2790 | -69.06300 | 0 |
2 | 0.000692 | 0.19711 | -0.79771 | 0.258710 | 0.78106 | -0.32113 | 0.90545 | -0.90654 | -0.30017 | 0.28302 | ... | -4.095300e+00 | 14.3590 | -55.645 | 3.0297 | 0.000000e+00 | 20.66700 | -16.179 | 54.5080 | -66.53000 | 0 |
3 | 0.000692 | 0.25182 | -0.45521 | 0.048438 | 0.66865 | -0.24698 | 0.82260 | -0.81961 | -0.17430 | 0.21383 | ... | 5.921200e+00 | 20.5420 | -59.961 | -14.3680 | 0.000000e+00 | 9.60790 | -22.916 | 76.8040 | -41.20300 | 0 |
4 | 0.000692 | 0.25243 | -0.45140 | 0.046101 | 0.66740 | -0.24615 | 0.82167 | -0.81864 | -0.17290 | 0.21306 | ... | 6.032500e+00 | 20.6110 | -60.009 | -14.5610 | -5.683100e-16 | 9.48500 | -22.991 | 77.0520 | -40.92200 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38835 | 0.114560 | 0.24997 | 131.76000 | -92.123000 | -56.70800 | 23.56700 | -132.91000 | 133.43000 | 93.34900 | -86.80100 | ... | 0.000000e+00 | 1.8619 | -13.274 | 3.1578 | -1.503200e+01 | -0.15607 | 24.314 | 3.1713 | 0.30762 | 1 |
38836 | 0.114560 | 0.24997 | 131.76000 | -92.128000 | -56.71000 | 23.56800 | -132.91000 | 133.43000 | 93.35200 | -86.80300 | ... | 1.515500e-15 | 1.8625 | -13.275 | 3.1569 | -1.503300e+01 | -0.15608 | 24.314 | 3.1713 | 0.30711 | 1 |
38837 | 0.114560 | 0.24998 | 131.77000 | -92.133000 | -56.71300 | 23.57000 | -132.91000 | 133.43000 | 93.35500 | -86.80400 | ... | 1.515500e-15 | 1.8630 | -13.275 | 3.1560 | -1.503400e+01 | -0.15608 | 24.315 | 3.1713 | 0.30660 | 1 |
38838 | 0.114560 | 0.24999 | 131.78000 | -92.137000 | -56.71500 | 23.57200 | -132.92000 | 133.44000 | 93.35800 | -86.80600 | ... | 1.515600e-15 | 1.8635 | -13.276 | 3.1550 | -1.503500e+01 | -0.15609 | 24.315 | 3.1713 | 0.30608 | 1 |
38839 | 0.114560 | 0.25000 | 131.79000 | -92.142000 | -56.71800 | 23.57300 | -132.92000 | 133.44000 | 93.36000 | -86.80700 | ... | 0.000000e+00 | 1.8641 | -13.276 | 3.1541 | -1.503600e+01 | -0.15609 | 24.315 | 3.1713 | 0.30557 | 1 |
38840 rows × 30 columns
Let's focus on the data and run a few experiments to familiarize ourselves with it.
features = df.columns
features = features.drop('target')
# Plot a class-conditional density estimate for each feature, three plots per row.
for feature in features:
    k = features.get_loc(feature) % 3
    if k == 0:  # start a new row of three subplots
        fig, axes = plt.subplots(1, 3, figsize = (24, 7))
    sns.kdeplot(df[feature], shade = True, hue = df['target'], ax = axes[k])
As seen, there is good separation between the two target classes for most features. As a result, tree-based models might work well for this application.
cor = df.corr()
plt.figure(figsize = (24, 20))
sns.set()
sns.heatmap(cor, annot = True, cmap = plt.cm.PiYG);
As seen, there is a good linear correlation between target and many of the features.
Let's first have a look at a contour plot that shows the relation between the two features vertex[15] x and vertex[15] y and the original target value (the real part of the least stable eigenvalue). As seen there, there is a clear boundary between the eigenvalues with positive and negative real parts, so tree-based models should be able to handle the classification problem at hand. We can also use KNN and an SVM with an RBF kernel.
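The code for this contour plot is not shown here; below is a minimal sketch of how such a figure could be produced, assuming the original eigenvalue column is re-read from the CSV (it has already been dropped from df).
# Hypothetical sketch: filled contours of the eigenvalue's real part over the vertex coordinates.
df_eig = pd.read_csv('Burgers-test-1.csv')
fig, ax = plt.subplots(figsize = (10, 8))
contours = ax.tricontourf(df_eig['vertex[15] x'], df_eig['vertex[15] y'],
                          df_eig['eigenvalue'], levels = 30, cmap = plt.cm.PiYG)
fig.colorbar(contours, ax = ax, label = 'Re(eigenvalue)')
ax.set_xlabel('vertex[15] x')
ax.set_ylabel('vertex[15] y');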
As a result, let's use only the first two features, vertex[15] x and vertex[15] y, to predict the binary target.
numeric_features = ['vertex[15] x', 'vertex[15] y']
Next, I will split the data into train and test splits. I use 20% of the examples for the test portion, which is an acceptable size since we have many examples in our data.
df_train, df_test = train_test_split(df, test_size = 0.2, random_state = 0)
print(f"There are {df_train.size:g} training and {df_test.size:g} test examples.")
pos_neg_ratio = df[df['target']==1].size/df[df['target']==0].size
print(f"The ratio of positive examples to negative examples is {pos_neg_ratio:f}")
There are 932160 training and 233040 test examples. The ratio of positive examples to negative examples is 1.696660
X_train = df_train[numeric_features]
y_train = df_train['target']
X_test = df_test[numeric_features]
y_test = df_test['target']
Define a preprocessor that applies standard scaling to the numeric_features.
preprocessor = make_column_transformer(
(
StandardScaler(), numeric_features
)
)
preprocessor
ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['vertex[15] x', 'vertex[15] y'])])
Define a pipeline for the application of the preprocessor along with a DummyClassifier
.
pipeline = make_pipeline(preprocessor, DummyClassifier(strategy = 'prior'))
pipeline
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['vertex[15] x', 'vertex[15] y'])])), ('dummyclassifier', DummyClassifier())])
I am defining the following function for cross-validation purposes.
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """Return the mean and standard deviation of each cross-validation score."""
    score = cross_validate(model, X_train, y_train, **kwargs)
    mean = pd.DataFrame(score).mean()
    std = pd.DataFrame(score).std()
    return {'value': mean, 'std': std}
results_dict = {} # dictionary to store all the results
I am using roc_auc as the scoring metric, which is a good choice for classification problems with class imbalance.
results_dict['dummy'] = mean_std_cross_val_scores(pipeline, X_train, y_train,
cv = 5, return_train_score = True,
scoring = 'roc_auc', n_jobs = -1)
pd.DataFrame(results_dict['dummy'])
value | std | |
---|---|---|
fit_time | 0.021879 | 0.008560 |
score_time | 0.008157 | 0.007852 |
test_score | 0.500000 | 0.000000 |
train_score | 0.500000 | 0.000000 |
The area under the ROC curve (AUC) is 0.5, which is what random guessing yields. This is the expected behavior of a DummyClassifier.
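As a quick illustration (not part of the original notebook), an uninformative constant score reproduces the 0.5 AUC directly with roc_auc_score:
# Hypothetical toy check: constant scores carry no ranking information, so AUC = 0.5.
y_toy = np.array([0, 1, 1, 0, 1])
print(roc_auc_score(y_toy, np.full(y_toy.shape, 0.5)))  # 0.5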
Next, let's try a LogisticRegression
model. Here, I am carrying out hyperparameter optimization using RandomizedSearchCV
. The optimization is performed for the hyperparameter C
in LogisticRegression
.
C_Vals = np.logspace(-2.0, 3.0, num = 41)
params = {'logisticregression__C': C_Vals}
pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter = 2000,
class_weight = 'balanced'))
rscv = RandomizedSearchCV(pipeline, params, n_iter = 20, n_jobs = -1,
random_state = 0, scoring = "roc_auc")
rscv.fit(X_train, y_train)
print(rscv.best_params_['logisticregression__C'])
237.13737056616552
I am making a new pipeline with the optimized hyperparameter value for C
in LogisticRegression
.
pipeline = make_pipeline(
preprocessor,
LogisticRegression(max_iter = 2000,
C = rscv.best_params_['logisticregression__C'],
class_weight = 'balanced'))
results_dict['logisticregression'] = mean_std_cross_val_scores(
pipeline, X_train, y_train, cv = 10, return_train_score = True,
scoring = 'roc_auc', n_jobs = -1)
pd.DataFrame(results_dict['logisticregression'])
value | std | |
---|---|---|
fit_time | 0.113733 | 0.015204 |
score_time | 0.005493 | 0.006426 |
test_score | 0.999884 | 0.000031 |
train_score | 0.999883 | 0.000004 |
We are getting a roc_auc of nearly 1.0, the highest possible score: the model ranks essentially all examples correctly in cross-validation.
I'll try out three more models: KNeighborsClassifier, SVC, and XGBClassifier. To deal with class imbalance, I pass the class ratio computed earlier as scale_pos_weight in XGBoost, as suggested in the documentation, and I set the class_weight of SVC to 'balanced'.
models = {
'knn': KNeighborsClassifier(),
'svm': SVC(class_weight = 'balanced'),
'xgboost': xgb.XGBClassifier(verbosity = 0, use_label_encoder = False,
scale_pos_weight = pos_neg_ratio),
}
for model in models:
    pipeline = make_pipeline(preprocessor, models[model])
    results_dict[model] = mean_std_cross_val_scores(pipeline, X_train, y_train,
                                                    return_train_score = True,
                                                    n_jobs = -1, scoring = "roc_auc")
# The results for KNN
pd.DataFrame(results_dict['knn'])
value | std | |
---|---|---|
fit_time | 0.035298 | 3.888137e-03 |
score_time | 0.050078 | 9.103614e-03 |
test_score | 0.999981 | 4.932859e-06 |
train_score | 0.999996 | 6.668202e-07 |
# The results for SVM
pd.DataFrame(results_dict['svm'])
value | std | |
---|---|---|
fit_time | 2.729952 | 1.324859e-01 |
score_time | 1.511304 | 1.074206e-01 |
test_score | 0.999998 | 2.232547e-06 |
train_score | 0.999998 | 9.477086e-07 |
# The results for XGBoost
pd.DataFrame(results_dict['xgboost'])
value | std | |
---|---|---|
fit_time | 2.002868 | 6.702143e-02 |
score_time | 0.034040 | 7.459798e-03 |
test_score | 0.999973 | 1.389701e-05 |
train_score | 1.000000 | 7.850462e-17 |
All of the new models also perform very well with the default hyperparameter values.
I am choosing the LogisticRegression
model with the optimized hyperparameter C
for final evaluation.
pipeline = make_pipeline(
preprocessor,
LogisticRegression(max_iter = 2000,
C = rscv.best_params_['logisticregression__C'],
class_weight = 'balanced'))
pipeline.fit(X_train, y_train)
print(classification_report(y_test, pipeline.predict(X_test),
target_names = ['negative', 'positive']))
              precision    recall  f1-score   support

    negative       0.99      0.99      0.99      2871
    positive       1.00      0.99      1.00      4897

    accuracy                           0.99      7768
   macro avg       0.99      0.99      0.99      7768
weighted avg       0.99      0.99      0.99      7768
The scores look promising.
plot_confusion_matrix(
pipeline,
X_test,
y_test,
display_labels = ['negative', 'positive'],
values_format = 'g',
cmap = plt.cm.PuRd,
colorbar = False,
);
plt.grid(False)
As seen in the confusion matrix for the trained pipeline (fit on the training set, evaluated on the test set), the numbers of false positives and false negatives are small compared to the true positives and true negatives.
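For reference, the raw counts behind this plot can also be printed with confusion_matrix (already imported above); this is a small addition that is not in the original notebook.
# Rows are true classes (negative, positive); columns are predicted classes.
print(confusion_matrix(y_test, pipeline.predict(X_test)))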
In this section, I will change the problem to use the Jacobian matrix entries as features and will drop the vertex[15]
location features.
numeric_features = df.columns
numeric_features = numeric_features.drop(
['vertex[15] x', 'vertex[15] y', 'target']).tolist()
numeric_features
['jac[4][4]', 'jac[4][6]', 'jac[4][13]', 'jac[4][22]', 'jac[6][4]', 'jac[6][6]', 'jac[6][13]', 'jac[6][15]', 'jac[13][4]', 'jac[13][13]', 'jac[13][22]', 'jac[13][23]', 'jac[15][4]', 'jac[15][6]', 'jac[15][13]', 'jac[15][15]', 'jac[15][22]', 'jac[15][23]', 'jac[22][4]', 'jac[22][13]', 'jac[22][22]', 'jac[22][23]', 'jac[23][6]', 'jac[23][13]', 'jac[23][15]', 'jac[23][22]', 'jac[23][23]']
Define a new column transformer to apply scaling to the new features.
preprocessor = make_column_transformer(
(
StandardScaler(), numeric_features
)
)
preprocessor
ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['jac[4][4]', 'jac[4][6]', 'jac[4][13]', 'jac[4][22]', 'jac[6][4]', 'jac[6][6]', 'jac[6][13]', 'jac[6][15]', 'jac[13][4]', 'jac[13][13]', 'jac[13][22]', 'jac[13][23]', 'jac[15][4]', 'jac[15][6]', 'jac[15][13]', 'jac[15][15]', 'jac[15][22]', 'jac[15][23]', 'jac[22][4]', 'jac[22][13]', 'jac[22][22]', 'jac[22][23]', 'jac[23][6]', 'jac[23][13]', 'jac[23][15]', 'jac[23][22]', 'jac[23][23]'])])
X_train = df_train[numeric_features]
y_train = df_train['target']
X_test = df_test[numeric_features]
y_test = df_test['target']
results_dict = {} # dictionary to store all the results
For this part, I will use the LGBMClassifier model. Let's build a pipeline and carry out the required computations.
pipeline = make_pipeline(preprocessor, LGBMClassifier(random_state = 0))
results_dict['lgbm'] = mean_std_cross_val_scores(
    pipeline, X_train, y_train, cv = 10, return_train_score = True,
    scoring = 'roc_auc', n_jobs = -1)
pd.DataFrame(results_dict['lgbm'])
value | std | |
---|---|---|
fit_time | 2.414794 | 9.829064e-01 |
score_time | 0.032567 | 1.393451e-02 |
test_score | 0.999997 | 3.152882e-06 |
train_score | 1.000000 | 5.233642e-17 |
Same as before, we are getting astonishing results.
pipeline.fit(X_train, y_train)
print(classification_report(y_test, pipeline.predict(X_test),
target_names = ['negative', 'positive']))
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      2871
    positive       1.00      1.00      1.00      4897

    accuracy                           1.00      7768
   macro avg       1.00      1.00      1.00      7768
weighted avg       1.00      1.00      1.00      7768
disp = plot_confusion_matrix(
pipeline,
X_test,
y_test,
display_labels = ['negative', 'positive'],
values_format = 'g',
cmap = plt.cm.PuRd,
colorbar = False,
);
plt.grid(False)
An important task that is left is to study feature importance and figure out which features contribute most to our predictions. For LogisticRegression we can simply inspect the feature coefficients (a short sketch of this follows below). However, for non-scikit-learn models such as LGBMClassifier we need to use more sophisticated methods. Let's start with eli5, which we can use to explain the feature weights of our LGBMClassifier model.
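As an aside, here is a minimal sketch of what inspecting those LogisticRegression coefficients could look like for the earlier two-feature pipeline; the pipeline is re-fit here purely for illustration and is not part of the original analysis.
# Hypothetical aside: coefficients of the earlier two-feature logistic regression.
coef_features = ['vertex[15] x', 'vertex[15] y']
lr_pipe = make_pipeline(
    make_column_transformer((StandardScaler(), coef_features)),
    LogisticRegression(max_iter = 2000, class_weight = 'balanced'))
lr_pipe.fit(df_train[coef_features], df_train['target'])
pd.DataFrame({'feature': coef_features,
              'coefficient': lr_pipe.named_steps['logisticregression'].coef_[0]})
With that aside done, here are the eli5 weights for the LGBMClassifier: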
eli5.explain_weights(pipeline.named_steps['lgbmclassifier'],
feature_names = numeric_features)
Weight | Feature |
---|---|
0.9596 | jac[6][13] |
0.0151 | jac[4][6] |
0.0100 | jac[15][15] |
0.0082 | jac[6][15] |
0.0023 | jac[6][6] |
0.0021 | jac[6][4] |
0.0014 | jac[4][4] |
0.0003 | jac[22][4] |
0.0002 | jac[15][6] |
0.0002 | jac[4][13] |
0.0001 | jac[4][22] |
0.0001 | jac[13][4] |
0.0001 | jac[15][23] |
0.0001 | jac[23][23] |
0.0001 | jac[15][22] |
0.0000 | jac[13][13] |
0.0000 | jac[15][4] |
0.0000 | jac[22][13] |
0.0000 | jac[23][13] |
0.0000 | jac[15][13] |
… 7 more … |
Here, eli5 tells us that the feature with the highest impact on the predictions is jac[6][13], which is interesting.
shap_explainer = shap.TreeExplainer(pipeline.named_steps['lgbmclassifier'])
shap_explainer_values = shap_explainer.shap_values(X_train)
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray.
shap_explainer_values[1].shape
(31072, 27)
shap_explainer_values_test = shap_explainer.shap_values(X_test)
shap_explainer_values_test[1].shape
(7768, 27)
values = np.abs(shap_explainer_values_test[1]).mean(0)
pd.DataFrame(data = values,
index = numeric_features,
columns = ['SHAP values']).sort_values(by = 'SHAP values', ascending = False)
SHAP values | |
---|---|
jac[6][13] | 4.545563e+00 |
jac[15][15] | 3.949945e+00 |
jac[6][15] | 1.396141e+00 |
jac[4][6] | 1.038393e+00 |
jac[22][4] | 5.268402e-01 |
jac[6][4] | 4.235165e-01 |
jac[15][6] | 3.677990e-01 |
jac[4][4] | 2.213640e-01 |
jac[6][6] | 1.527950e-01 |
jac[15][4] | 2.970371e-02 |
jac[23][23] | 2.081470e-02 |
jac[4][22] | 1.711044e-02 |
jac[13][4] | 1.701067e-02 |
jac[15][23] | 1.579014e-02 |
jac[4][13] | 1.380722e-02 |
jac[23][13] | 1.188273e-02 |
jac[15][13] | 5.507729e-03 |
jac[13][13] | 4.731218e-03 |
jac[13][22] | 2.703677e-03 |
jac[15][22] | 2.531152e-03 |
jac[23][15] | 8.954353e-04 |
jac[22][22] | 7.199953e-04 |
jac[22][13] | 7.131488e-04 |
jac[22][23] | 4.571032e-04 |
jac[23][22] | 1.535875e-04 |
jac[13][23] | 1.050718e-18 |
jac[23][6] | 0.000000e+00 |
SHAP also tells us that, on average, the most important feature for predicting the positive class is jac[6][13]. Here, jac[15][15] also has a large importance. We can think of these values as global feature importances.
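These global importances can also be visualized with SHAP's bar-style summary plot; this is an optional extra, not part of the original notebook.
# Mean |SHAP value| per feature on the test set, shown as a bar chart.
shap.summary_plot(shap_explainer_values_test[1], X_test, plot_type = "bar")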
shap.initjs()
shap.dependence_plot('jac[6][13]', shap_explainer_values[1], X_train)
This dependence plot tells us that high values of jac[6][13] tend to produce high SHAP values, which push the prediction toward the positive class. On the other hand, lower values of jac[6][13] give lower SHAP values, pushing the prediction toward the negative class.
shap.summary_plot(shap_explainer_values[1], X_train)
The summary plot also tells us that higher values of jac[6][13] tend to push the prediction toward the positive class, and the same holds for jac[15][15].
Now, let's try to explain the predictions for a few individual examples. Here are the predicted class probabilities for one example.
ex1_id = 0
pipeline.named_steps["lgbmclassifier"].predict_proba(X_test)[ex1_id]
array([0.29239654, 0.70760346])
And here is the raw score of the model for this prediction.
pipeline.named_steps["lgbmclassifier"].predict(X_test, raw_score = True)[ex1_id]
0.8837729529463807
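The raw score lives in log-odds space; applying the sigmoid to it recovers the predicted positive-class probability reported above. A quick check using the number printed above (not part of the original notebook):
# sigmoid(raw score) should match predict_proba's positive-class probability (about 0.7076).
raw_score = 0.8837729529463807
print(1.0 / (1.0 + np.exp(-raw_score)))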
shap_explainer.expected_value[1]
3.022102865429598
shap.force_plot(
shap_explainer.expected_value[1],
shap_explainer_values_test[1][ex1_id, :],
X_test.iloc[ex1_id, :],
matplotlib = True,
)
As seen in this force plot, the feature contributions push the raw score well below the base value, i.e. toward the negative class, and the true label for this example is indeed negative, as shown below. Note, however, that the raw score is still positive (a predicted probability of about 0.71 for the positive class), so the classifier's hard prediction at the default threshold is actually the positive class for this example.
y_test.iloc[ex1_id]
0
Next example,
ex2_id = 1000
pipeline.named_steps["lgbmclassifier"].predict_proba(X_test)[ex2_id]
array([1.6439865e-05, 9.9998356e-01])
pipeline.named_steps["lgbmclassifier"].predict(X_test, raw_score = True)[ex2_id]
11.015784937396734
shap_explainer.expected_value[1]
3.022102865429598
shap.force_plot(
shap_explainer.expected_value[1],
shap_explainer_values_test[1][ex2_id, :],
X_test.iloc[ex2_id, :],
matplotlib = True,
)
Here, the raw score is well above the base value: the model predicts the positive class with high confidence. This prediction is correct, as shown below.
y_test.iloc[ex2_id]
1