Classify outliers in Excel file

This commit is contained in:
Laborratte 5 2025-06-22 17:29:35 +02:00
parent 5f9f4a6c7f
commit 0bf7c19f86
Signed by: Laborratte5
GPG key ID: 3A30072E35202C02

View file

@@ -2,41 +2,39 @@ import numpy as np
 import pandas
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import make_pipeline
-from sklearn import tree
-from sklearn.preprocessing import OrdinalEncoder
-from sklearn.tree import export_text
+from sklearn.neighbors import LocalOutlierFactor
 NO_DATA: str = "NO DATA"
-def read_excel(path: str, target_column: str):
+def read_excel(path: str):
     data = pandas.read_excel(path)
-    X = data.drop(target_column, axis=1)
-    y = data[target_column]
-    feature_names = data.columns.drop(target_column)
+    X = data
+    feature_names = data.columns
     for feat in feature_names:
-        if feat == target_column:
-            continue
         if feat.startswith("Messpunkt"):
             # Convert to numerical value
             X[feat] = X[feat].replace(to_replace=NO_DATA, value=np.nan)
             X[feat] = X[feat].astype('float64')
         else:
             # Convert to categorical value
-            X[feat] = X[feat].astype('string')
+            X[feat] = X[feat].replace(to_replace=NO_DATA, value=None).astype('object')
-    return X, y, feature_names
+    return X
-def classify(X, y, feature_names=None):
-    clf = tree.DecisionTreeClassifier()
-    clf = clf.fit(X, y)
-    r = export_text(clf, feature_names=feature_names)
-    print(r)
+def classify(X, data_frame):
+    clf = LocalOutlierFactor(n_neighbors=3)
+    y_pred = clf.fit_predict(X)
+    #X_scores = clf.negative_outlier_factor_
+    #print(y_pred)
+    #print(X_scores)
+    return data_frame[y_pred < 0]
 if __name__ == '__main__':
-    X, y, feature_names = read_excel("tests/Beispiel Auswertung2.xlsx", "Motornummer")
-    pipe = make_pipeline(OrdinalEncoder(), SimpleImputer())
+    data_frame = read_excel("tests/Beispiel Auswertung2.xlsx")
+    X = pandas.get_dummies(data_frame) # OneHotEncode categorical values
+    pipe = make_pipeline(SimpleImputer(add_indicator=True))
     X = pipe.fit_transform(X)
-    print(X)
-    classify(X, y, feature_names)
+    outlier = classify(X, data_frame)
+    print(outlier)