From 703cbd1ea79ea500ea03383a0fbe98696acdd21e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9E=97=E5=98=89=E7=83=A8?= Date: Fri, 16 Jan 2026 19:34:00 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20data.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data.py | 92 --------------------------------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 data.py diff --git a/data.py b/data.py deleted file mode 100644 index 067d120..0000000 --- a/data.py +++ /dev/null @@ -1,92 +0,0 @@ -import polars as pl -import pandera as pa -from pandera import Column, Check, DataFrameSchema -import logging - -# 配置日志 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -# ========================================== -# 1. 定义 Pandera Schema (数据契约) -# ========================================== -# 原始数据 Schema -raw_schema = DataFrameSchema({ - "age": Column(int, checks=Check.ge(18)), - "job": Column(str), - "marital": Column(str), - "education": Column(str), - "default": Column(str, checks=Check.isin(["yes", "no"])), - "balance": Column(int), - "housing": Column(str, checks=Check.isin(["yes", "no"])), - "loan": Column(str, checks=Check.isin(["yes", "no"])), - "contact": Column(str), - "day": Column(int, checks=[Check.ge(1), Check.le(31)]), - "month": Column(str), - "duration": Column(int, checks=Check.ge(0)), - "campaign": Column(int, checks=Check.ge(1)), - "pdays": Column(int), - "previous": Column(int, checks=Check.ge(0)), - "poutcome": Column(str), - "deposit": Column(str, checks=Check.isin(["yes", "no"])), -}) - -# 清洗后 Schema -processed_schema = DataFrameSchema({ - "age": Column(int), - "balance": Column(int), - "deposit": Column(int, checks=Check.isin([0, 1])), - # 其他数值化或保留的特征... -}) - -def load_and_clean_data(file_path: str): - """ - 使用 Polars 加载并清洗数据 - """ - logger.info(f"正在加载数据: {file_path}") - - # 1. Lazy Load - lf = pl.scan_csv(file_path) - - # 2. 初步清洗计划 - # - 移除 duration (避免数据泄露) - # - 将 deposit (yes/no) 转换为 (1/0) - # - 简单的分类变量编码 (为了 LightGBM,我们可以保留分类类型或做 Label Encoding) - # LightGBM 原生支持 Category,但 sklearn 需要数值。 - # 为了通用性,这里做 Label Encoding 或者 One-Hot。 - # 但 Polars 的 Label Encoding 比较手动。 - # 我们这里先只做核心转换。 - - processed_lf = ( - lf.drop(["duration"]) # 移除泄露特征 - .with_columns([ - pl.col("deposit").replace({"yes": 1, "no": 0}).cast(pl.Int64).alias("target"), - # 简单的特征工程示例:将 pdays -1 处理为 999 或单独一类 (这里保持原样,树模型能处理) - ]) - .drop("deposit") # 移除原始标签列,保留 target - ) - - # 3. 执行计算 (Collect) - df = processed_lf.collect() - - logger.info(f"数据加载完成,形状: {df.shape}") - - # 4. Pandera 验证 (转换回 Pandas 验证,因为 Pandera 对 Polars 支持尚在实验阶段或部分支持) - # 这里我们验证关键字段 - try: - # 简单验证一下 target 是否只有 0 和 1 - assert df["target"].n_unique() <= 2 - logger.info("基础数据验证通过") - except Exception as e: - logger.error(f"数据验证失败: {e}") - raise e - - return df - -if __name__ == "__main__": - # 测试代码 - try: - df = load_and_clean_data("data/bank.csv") - print(df.head()) - except Exception as e: - print(f"Error: {e}")