Transform data and fit decisionTree

2025-06-06 14:44:18 +02:00 · 2025-06-06 14:44:18 +02:00 · 5f9f4a6c7f
commit 5f9f4a6c7f
parent e232b49424
1 changed files with 37 additions and 3 deletions
--- a/src/cluster_test/cluster.py
+++ b/src/cluster_test/cluster.py
@@ -1,8 +1,42 @@
+import numpy as np
 import pandas
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import make_pipeline
+from sklearn import tree
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.tree import export_text

-def read_excel(path: str):
+NO_DATA: str = "NO DATA"
+
+def read_excel(path: str, target_column: str):
    data = pandas.read_excel(path)
-    print(data)
+    X = data.drop(target_column, axis=1)
+    y = data[target_column]
+    feature_names = data.columns.drop(target_column)
+
+    for feat in feature_names:
+        if feat == target_column:
+            continue
+
+        if feat.startswith("Messpunkt"):
+            # Convert to numerical value
+            X[feat] = X[feat].replace(to_replace=NO_DATA, value=np.nan)
+            X[feat] = X[feat].astype('float64')
+        else:
+            # Convert to categorical value
+            X[feat] = X[feat].astype('string')
+
+    return X, y, feature_names
+
+def classify(X, y, feature_names=None):
+    clf = tree.DecisionTreeClassifier()
+    clf = clf.fit(X, y)
+    r = export_text(clf, feature_names=feature_names)
+    print(r)

 if __name__ == '__main__':
-    read_excel("tests/Beispiel Auswertung.xlsx")
+    X, y, feature_names = read_excel("tests/Beispiel Auswertung2.xlsx", "Motornummer")
+    pipe = make_pipeline(OrdinalEncoder(), SimpleImputer())
+    X = pipe.fit_transform(X)
+    print(X)
+    classify(X, y, feature_names)