Classify outliers in Excel file
This commit is contained in:
parent
5f9f4a6c7f
commit
0bf7c19f86
1 changed file with 19 additions and 21 deletions
|
|
@@ -2,41 +2,39 @@ import numpy as np
|
||||||
import pandas
|
import pandas
|
||||||
from sklearn.impute import SimpleImputer
|
from sklearn.impute import SimpleImputer
|
||||||
from sklearn.pipeline import make_pipeline
|
from sklearn.pipeline import make_pipeline
|
||||||
from sklearn import tree
|
from sklearn.neighbors import LocalOutlierFactor
|
||||||
from sklearn.preprocessing import OrdinalEncoder
|
|
||||||
from sklearn.tree import export_text
|
|
||||||
|
|
||||||
NO_DATA: str = "NO DATA"
|
NO_DATA: str = "NO DATA"
|
||||||
|
|
||||||
def read_excel(path: str, target_column: str):
|
def read_excel(path: str):
|
||||||
data = pandas.read_excel(path)
|
data = pandas.read_excel(path)
|
||||||
X = data.drop(target_column, axis=1)
|
X = data
|
||||||
y = data[target_column]
|
feature_names = data.columns
|
||||||
feature_names = data.columns.drop(target_column)
|
|
||||||
|
|
||||||
for feat in feature_names:
|
for feat in feature_names:
|
||||||
if feat == target_column:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if feat.startswith("Messpunkt"):
|
if feat.startswith("Messpunkt"):
|
||||||
# Convert to numerical value
|
# Convert to numerical value
|
||||||
X[feat] = X[feat].replace(to_replace=NO_DATA, value=np.nan)
|
X[feat] = X[feat].replace(to_replace=NO_DATA, value=np.nan)
|
||||||
X[feat] = X[feat].astype('float64')
|
X[feat] = X[feat].astype('float64')
|
||||||
else:
|
else:
|
||||||
# Convert to categorical value
|
# Convert to categorical value
|
||||||
X[feat] = X[feat].astype('string')
|
X[feat] = X[feat].replace(to_replace=NO_DATA, value=None).astype('object')
|
||||||
|
|
||||||
return X, y, feature_names
|
return X
|
||||||
|
|
||||||
def classify(X, y, feature_names=None):
|
def classify(X, data_frame):
|
||||||
clf = tree.DecisionTreeClassifier()
|
clf = LocalOutlierFactor(n_neighbors=3)
|
||||||
clf = clf.fit(X, y)
|
y_pred = clf.fit_predict(X)
|
||||||
r = export_text(clf, feature_names=feature_names)
|
#X_scores = clf.negative_outlier_factor_
|
||||||
print(r)
|
|
||||||
|
#print(y_pred)
|
||||||
|
#print(X_scores)
|
||||||
|
return data_frame[y_pred < 0]
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
X, y, feature_names = read_excel("tests/Beispiel Auswertung2.xlsx", "Motornummer")
|
data_frame = read_excel("tests/Beispiel Auswertung2.xlsx")
|
||||||
pipe = make_pipeline(OrdinalEncoder(), SimpleImputer())
|
X = pandas.get_dummies(data_frame) # OneHotEncode categorical values
|
||||||
|
pipe = make_pipeline(SimpleImputer(add_indicator=True))
|
||||||
X = pipe.fit_transform(X)
|
X = pipe.fit_transform(X)
|
||||||
print(X)
|
outlier = classify(X, data_frame)
|
||||||
classify(X, y, feature_names)
|
print(outlier)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue