From aabcd6a80aec9d0836589b287902759ac8dc6675 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=82=A2=E5=8F=AF=E6=98=93?= <13816688325@163.com>
Date: Mon, 12 Jan 2026 16:21:42 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=95=B0=E6=8D=AE=E6=B8=85=E6=B4=97?=
 =?UTF-8?q?=E6=B5=81=E6=B0=B4=E7=BA=BF=E6=A8=A1=E5=9D=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: src/data_pipeline.py was revised during review (docstrings,
explicit target validation in split_Xy, no-op __init__ removed); the blob id
on its index line still refers to the pre-review content.

 .gitignore           |  5 +++++
 src/__init__.py      |  1 +
 src/data_pipeline.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 src/__init__.py
 create mode 100644 src/data_pipeline.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..36fbda3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.env
+.venv/
+__pycache__/
+.ipynb_checkpoints/
+.DS_Store
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..a9a2c5b
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+__all__ = []
diff --git a/src/data_pipeline.py b/src/data_pipeline.py
new file mode 100644
index 0000000..7399662
--- /dev/null
+++ b/src/data_pipeline.py
@@ -0,0 +1,42 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+class CleanTransformer(BaseEstimator, TransformerMixin):
+    """Stateless transformer that drops exact duplicate rows."""
+    def fit(self, X, y=None):
+        """Learn nothing and return ``self``."""
+        return self
+    def transform(self, X):
+        """Return ``X`` minus duplicate rows; any separate ``y`` is not filtered."""
+        return X.drop_duplicates()
+
+def build_preprocess(columns, target):
+    """Return ``(transformer, feature_columns)`` for every column but ``target``.
+
+    All features are treated as numeric: median imputation, then scaling.
+    """
+    num_cols = [c for c in columns if c != target]
+    numeric = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
+    ct = ColumnTransformer([("num", numeric, num_cols)], remainder="drop")
+    return ct, num_cols
+
+def load_data(path):
+    """Read the CSV file at ``path`` into a DataFrame."""
+    return pd.read_csv(path)
+
+def split_Xy(df, target):
+    """Split ``df`` into features ``X`` and the ``target`` column cast to ``int``.
+
+    Raises ``KeyError`` if ``target`` is not a column and ``ValueError`` if it
+    holds missing values, which cannot be represented as ``int``.
+    """
+    if target not in df.columns:
+        raise KeyError(f"target column {target!r} not found")
+    if df[target].isna().to_numpy().any():
+        raise ValueError(f"target column {target!r} contains missing values")
+    return df.drop(columns=[target]), df[target].astype(int)