In the following, we predict the benchmark instance labels provided by GBD’s meta.db from instance feature records provided by GBD’s base.db.
from gbd_core.api import GBD
import pandas as pd
def get_available_features():
with GBD([ 'data/base.db' ]) as gbd:
return gbd.get_features('base')
def get_prediction_dataset(features, target):
with GBD([ 'data/base.db', 'data/meta.db' ]) as gbd:
df = gbd.query('base_features_runtime != memout', resolve=features+[target])
df[features] = df[features].apply(pd.to_numeric)
return df
print(get_available_features())
['base_features_runtime', 'clauses', 'variables', 'cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8', 'cls9', 'cls10p', 'horn', 'invhorn', 'positive', 'negative', 'hornvars_mean', 'hornvars_variance', 'hornvars_min', 'hornvars_max', 'hornvars_entropy', 'invhornvars_mean', 'invhornvars_variance', 'invhornvars_min', 'invhornvars_max', 'invhornvars_entropy', 'balancecls_mean', 'balancecls_variance', 'balancecls_min', 'balancecls_max', 'balancecls_entropy', 'balancevars_mean', 'balancevars_variance', 'balancevars_min', 'balancevars_max', 'balancevars_entropy', 'vcg_vdegree_mean', 'vcg_vdegree_variance', 'vcg_vdegree_min', 'vcg_vdegree_max', 'vcg_vdegree_entropy', 'vcg_cdegree_mean', 'vcg_cdegree_variance', 'vcg_cdegree_min', 'vcg_cdegree_max', 'vcg_cdegree_entropy', 'vg_degree_mean', 'vg_degree_variance', 'vg_degree_min', 'vg_degree_max', 'vg_degree_entropy', 'cg_degree_mean', 'cg_degree_variance', 'cg_degree_min', 'cg_degree_max', 'cg_degree_entropy']
Train instance category predictor once and report its accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
feat = get_available_features()
data = get_prediction_dataset(feat, 'family')
X_train, X_test, y_train, y_test = train_test_split(data[feat], data['family'], test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
Accuracy: 0.969248670212766
Extract feature importance from category predictor.
def print_feature_importances(model, feat, granularity=50):
feature_importances = model.feature_importances_
maximum = int(max(feature_importances) * granularity)+1
byimp = [ [] for _ in range(maximum) ]
for i, imp in enumerate(feature_importances):
byimp[int(imp*granularity)].append((feat[i]))
fimp = [ ((i/granularity, (i+1)/granularity), byimp[i]) for i in range(maximum-1, -1, -1) if len(byimp[i]) > 0 ]
print("Feature importances:", fimp)
print_feature_importances(model, feat)
Feature importances: [((0.04, 0.06), ['hornvars_variance', 'hornvars_entropy', 'balancecls_mean', 'balancecls_variance', 'balancevars_mean', 'vcg_cdegree_mean']), ((0.02, 0.04), ['hornvars_mean', 'hornvars_max', 'invhornvars_mean', 'balancevars_variance', 'balancevars_entropy', 'vcg_vdegree_mean', 'vcg_cdegree_variance', 'vcg_cdegree_entropy', 'vg_degree_mean', 'vg_degree_entropy', 'cg_degree_min']), ((0.0, 0.02), ['base_features_runtime', 'clauses', 'variables', 'cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8', 'cls9', 'cls10p', 'horn', 'invhorn', 'positive', 'negative', 'hornvars_min', 'invhornvars_variance', 'invhornvars_min', 'invhornvars_max', 'invhornvars_entropy', 'balancecls_min', 'balancecls_max', 'balancecls_entropy', 'balancevars_min', 'balancevars_max', 'vcg_vdegree_variance', 'vcg_vdegree_min', 'vcg_vdegree_max', 'vcg_vdegree_entropy', 'vcg_cdegree_min', 'vcg_cdegree_max', 'vg_degree_variance', 'vg_degree_min', 'vg_degree_max', 'cg_degree_mean', 'cg_degree_variance', 'cg_degree_max', 'cg_degree_entropy'])]
Train the category prediction model on subsets of the base features and evaluate feature importance for them.
subsets = {
'Clause Counts': ['clauses', 'variables', 'cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8', 'cls9', 'cls10p', 'horn', 'invhorn', 'positive', 'negative'],
'Horn Variables': ['hornvars_mean', 'hornvars_variance', 'hornvars_min', 'hornvars_max', 'hornvars_entropy', 'invhornvars_mean', 'invhornvars_variance', 'invhornvars_min', 'invhornvars_max', 'invhornvars_entropy'],
'Polarity Balance': ['balancecls_mean', 'balancecls_variance', 'balancecls_min', 'balancecls_max', 'balancecls_entropy', 'balancevars_mean', 'balancevars_variance', 'balancevars_min', 'balancevars_max', 'balancevars_entropy'],
'Bipartite Graph': ['vcg_vdegree_mean', 'vcg_vdegree_variance', 'vcg_vdegree_min', 'vcg_vdegree_max', 'vcg_vdegree_entropy', 'vcg_cdegree_mean', 'vcg_cdegree_variance', 'vcg_cdegree_min', 'vcg_cdegree_max', 'vcg_cdegree_entropy'],
'Variable Graph': ['vg_degree_mean', 'vg_degree_variance', 'vg_degree_min', 'vg_degree_max', 'vg_degree_entropy'],
'Clause Graph': ['cg_degree_mean', 'cg_degree_variance', 'cg_degree_min', 'cg_degree_max', 'cg_degree_entropy']
}
for name, sub in subsets.items():
submodel = RandomForestClassifier()
submodel.fit(X_train[sub], y_train)
y_pred_sub = submodel.predict(X_test[sub])
accuracy_feature = accuracy_score(y_test, y_pred_sub)
print(f"Accuracy with {name} features:", accuracy_feature)
print_feature_importances(submodel, sub)
Accuracy with Clause Counts features: 0.9498005319148937
Feature importances: [((0.08, 0.1), ['cls2', 'cls3', 'cls10p', 'negative']), ((0.06, 0.08), ['variables', 'cls1', 'horn', 'positive']), ((0.04, 0.06), ['clauses', 'cls4', 'cls8', 'cls9', 'invhorn']), ((0.02, 0.04), ['cls5', 'cls6', 'cls7'])]
Accuracy with Horn Variables features: 0.937998670212766
Feature importances: [((0.18, 0.2), ['hornvars_mean']), ((0.16, 0.18), ['hornvars_variance']), ((0.14, 0.16), ['hornvars_entropy']), ((0.12, 0.14), ['invhornvars_mean']), ((0.1, 0.12), ['hornvars_max']), ((0.08, 0.1), ['invhornvars_variance', 'invhornvars_entropy']), ((0.06, 0.08), ['invhornvars_max']), ((0.0, 0.02), ['hornvars_min', 'invhornvars_min'])]
Accuracy with Polarity Balance features: 0.9398271276595744
Feature importances: [((0.18, 0.2), ['balancevars_mean']), ((0.14, 0.16), ['balancecls_mean']), ((0.12, 0.14), ['balancecls_variance', 'balancevars_variance']), ((0.1, 0.12), ['balancevars_entropy']), ((0.08, 0.1), ['balancecls_entropy', 'balancevars_min']), ((0.06, 0.08), ['balancecls_max']), ((0.04, 0.06), ['balancevars_max']), ((0.0, 0.02), ['balancecls_min'])]
Accuracy with Bipartite Graph features: 0.9571143617021277
Feature importances: [((0.2, 0.22), ['vcg_cdegree_mean']), ((0.14, 0.16), ['vcg_cdegree_variance']), ((0.12, 0.14), ['vcg_vdegree_mean']), ((0.1, 0.12), ['vcg_vdegree_entropy', 'vcg_cdegree_max', 'vcg_cdegree_entropy']), ((0.06, 0.08), ['vcg_vdegree_variance', 'vcg_vdegree_max']), ((0.04, 0.06), ['vcg_cdegree_min']), ((0.0, 0.02), ['vcg_vdegree_min'])]
Accuracy with Variable Graph features: 0.9336768617021277
Feature importances: [((0.32, 0.34), ['vg_degree_mean']), ((0.24, 0.26), ['vg_degree_entropy']), ((0.2, 0.22), ['vg_degree_variance', 'vg_degree_max']), ((0.0, 0.02), ['vg_degree_min'])]
Accuracy with Clause Graph features: 0.9378324468085106
Feature importances: [((0.22, 0.24), ['cg_degree_max']), ((0.18, 0.2), ['cg_degree_mean', 'cg_degree_variance', 'cg_degree_min', 'cg_degree_entropy'])]