From 0bf7c19f86fd45ffc75df97b246f89f5924e2aa1 Mon Sep 17 00:00:00 2001 From: Laborratte5 Date: Sun, 22 Jun 2025 17:29:35 +0200 Subject: [PATCH] Classify outlier in excel file --- src/cluster_test/cluster.py | 40 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/cluster_test/cluster.py b/src/cluster_test/cluster.py index a74263b..1973f47 100644 --- a/src/cluster_test/cluster.py +++ b/src/cluster_test/cluster.py @@ -2,41 +2,39 @@ import numpy as np import pandas from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from sklearn import tree -from sklearn.preprocessing import OrdinalEncoder -from sklearn.tree import export_text +from sklearn.neighbors import LocalOutlierFactor NO_DATA: str = "NO DATA" -def read_excel(path: str, target_column: str): +def read_excel(path: str): data = pandas.read_excel(path) - X = data.drop(target_column, axis=1) - y = data[target_column] - feature_names = data.columns.drop(target_column) + X = data + feature_names = data.columns for feat in feature_names: - if feat == target_column: - continue - if feat.startswith("Messpunkt"): # Convert to numerical value X[feat] = X[feat].replace(to_replace=NO_DATA, value=np.nan) X[feat] = X[feat].astype('float64') else: # Convert to categorical value - X[feat] = X[feat].astype('string') + X[feat] = X[feat].replace(to_replace=NO_DATA, value=None).astype('object') - return X, y, feature_names + return X -def classify(X, y, feature_names=None): - clf = tree.DecisionTreeClassifier() - clf = clf.fit(X, y) - r = export_text(clf, feature_names=feature_names) - print(r) +def classify(X, data_frame): + clf = LocalOutlierFactor(n_neighbors=3) + y_pred = clf.fit_predict(X) + #X_scores = clf.negative_outlier_factor_ + + #print(y_pred) + #print(X_scores) + return data_frame[y_pred < 0] if __name__ == '__main__': - X, y, feature_names = read_excel("tests/Beispiel Auswertung2.xlsx", "Motornummer") - pipe = make_pipeline(OrdinalEncoder(), SimpleImputer()) + data_frame = read_excel("tests/Beispiel Auswertung2.xlsx") + X = pandas.get_dummies(data_frame) # OneHotEncode categorical values + pipe = make_pipeline(SimpleImputer(add_indicator=True)) X = pipe.fit_transform(X) - print(X) - classify(X, y, feature_names) + outlier = classify(X, data_frame) + print(outlier)