news 2026/6/12 19:45:56

DAY36 复习日

作者头像

张小明

前端开发工程师

1.2k 24
文章封面图
DAY36 复习日

我们使用了神经网络的方式,用了pytorch重新对信贷数据集进行处理。

import pandas as pd import numpy as np import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix import matplotlib.pyplot as plt import seaborn as sns import os # 设置随机种子以保证结果可复现 torch.manual_seed(42) np.random.seed(42) # --- 1. 数据预处理 --- def load_and_preprocess_data(filepath): print("Loading data...") df = pd.read_csv(filepath) # 删除 Id 列 if 'Id' in df.columns: df = df.drop('Id', axis=1) print(f"Original shape: {df.shape}") # 处理 'Current Loan Amount' 异常值 (99999999.0 通常表示无限制或错误) # 替换为 NaN 然后进行插补,或者替换为最大有效值。 # 这里我们替换为 NaN 并使用中位数插补,如果数量太多也可以直接删除。 # 先检查一下数量。 outlier_mask = df['Current Loan Amount'] == 99999999.0 df.loc[outlier_mask, 'Current Loan Amount'] = np.nan # 解析 'Years in current job' # 映射关系: '< 1 year'->0, '1 year'->1, ..., '10+ years'->10 def parse_years(x): if pd.isna(x): return np.nan if '<' in x: return 0 if '+' in x: return 10 return int(x.split()[0]) df['Years in current job'] = df['Years in current job'].apply(parse_years) # 插补缺失值 # 数值列 num_cols = df.select_dtypes(include=[np.number]).columns for col in num_cols: if col != 'Credit Default': # 使用中位数以增强鲁棒性 median_val = df[col].median() df[col].fillna(median_val, inplace=True) # 类别列 cat_cols = df.select_dtypes(include=['object']).columns for col in cat_cols: mode_val = df[col].mode()[0] df[col].fillna(mode_val, inplace=True) # 编码类别变量 (One-Hot 编码) df = pd.get_dummies(df, columns=cat_cols, drop_first=True) print(f"Processed shape: {df.shape}") # 划分数据 X = df.drop('Credit Default', axis=1).values y = df['Credit Default'].values # 70% 训练集, 15% 验证集, 15% 测试集 # 第一次划分: 训练集 (70%) 和 临时集 (30%) X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y) # 第二次划分: 验证集 (总量的 15% -> 临时集的 50%) 和 测试集 (总量的 15% -> 临时集的 50%) X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp) # 标准化 scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_val = scaler.transform(X_val) X_test = scaler.transform(X_test) return X_train, y_train, X_val, y_val, X_test, y_test, df.drop('Credit Default', axis=1).columns # --- 2. PyTorch 数据集 --- class CreditDataset(Dataset): def __init__(self, X, y): self.X = torch.FloatTensor(X) self.y = torch.FloatTensor(y).unsqueeze(1) # 二分类需要 (N, 1) 的形状 def __len__(self): return len(self.X) def __getitem__(self, idx): return self.X[idx], self.y[idx] # --- 3. 神经网络模型 --- class CreditNN(nn.Module): def __init__(self, input_dim): super(CreditNN, self).__init__() # 3 个隐藏层: 128 -> 64 -> 32 self.layer1 = nn.Linear(input_dim, 128) self.relu1 = nn.ReLU() self.dropout1 = nn.Dropout(0.3) self.layer2 = nn.Linear(128, 64) self.relu2 = nn.ReLU() self.dropout2 = nn.Dropout(0.3) self.layer3 = nn.Linear(64, 32) self.relu3 = nn.ReLU() self.output = nn.Linear(32, 1) self.sigmoid = nn.Sigmoid() def forward(self, x): x = self.dropout1(self.relu1(self.layer1(x))) x = self.dropout2(self.relu2(self.layer2(x))) x = self.relu3(self.layer3(x)) x = self.sigmoid(self.output(x)) return x # --- 4. 训练函数 --- def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100, patience=10): train_losses = [] val_losses = [] train_accs = [] val_accs = [] best_val_loss = float('inf') epochs_no_improve = 0 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) print(f"Training on {device}") for epoch in range(num_epochs): model.train() running_loss = 0.0 correct_train = 0 total_train = 0 for X_batch, y_batch in train_loader: X_batch, y_batch = X_batch.to(device), y_batch.to(device) optimizer.zero_grad() outputs = model(X_batch) loss = criterion(outputs, y_batch) loss.backward() optimizer.step() running_loss += loss.item() predicted = (outputs > 0.5).float() total_train += y_batch.size(0) correct_train += (predicted == y_batch).sum().item() epoch_train_loss = running_loss / len(train_loader) epoch_train_acc = correct_train / total_train train_losses.append(epoch_train_loss) train_accs.append(epoch_train_acc) # 验证 model.eval() running_val_loss = 0.0 correct_val = 0 total_val = 0 with torch.no_grad(): for X_batch, y_batch in val_loader: X_batch, y_batch = X_batch.to(device), y_batch.to(device) outputs = model(X_batch) loss = criterion(outputs, y_batch) running_val_loss += loss.item() predicted = (outputs > 0.5).float() total_val += y_batch.size(0) correct_val += (predicted == y_batch).sum().item() epoch_val_loss = running_val_loss / len(val_loader) epoch_val_acc = correct_val / total_val val_losses.append(epoch_val_loss) val_accs.append(epoch_val_acc) print(f"Epoch [{epoch+1}/{num_epochs}] " f"Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | " f"Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}") # 早停检查 if epoch_val_loss < best_val_loss: best_val_loss = epoch_val_loss epochs_no_improve = 0 # 保存最佳模型 torch.save(model.state_dict(), 'best_credit_model.pth') else: epochs_no_improve += 1 if epochs_no_improve >= patience: print("Early stopping triggered!") break return train_losses, val_losses, train_accs, val_accs # --- 5. 评估与可视化 --- def evaluate_model(model, test_loader, feature_names): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.load_state_dict(torch.load('best_credit_model.pth')) model.to(device) model.eval() y_true = [] y_pred = [] y_scores = [] with torch.no_grad(): for X_batch, y_batch in test_loader: X_batch = X_batch.to(device) outputs = model(X_batch) y_scores.extend(outputs.cpu().numpy()) predicted = (outputs > 0.5).float() y_pred.extend(predicted.cpu().numpy()) y_true.extend(y_batch.numpy()) y_true = np.array(y_true) y_pred = np.array(y_pred) y_scores = np.array(y_scores) # 指标计算 acc = accuracy_score(y_true, y_pred) prec = precision_score(y_true, y_pred) rec = recall_score(y_true, y_pred) auc = roc_auc_score(y_true, y_scores) print("\n--- Test Set Evaluation ---") print(f"Accuracy: {acc:.4f}") print(f"Precision: {prec:.4f}") print(f"Recall: {rec:.4f}") print(f"AUC: {auc:.4f}") # 混淆矩阵 cm = confusion_matrix(y_true, y_pred) plt.figure(figsize=(6, 5)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') plt.title('Confusion Matrix') plt.ylabel('True Label') plt.xlabel('Predicted Label') plt.savefig('confusion_matrix.png') print("Saved confusion_matrix.png") # 可视化第一层权重 (特征重要性近似) # 我们取每个输入特征的权重的绝对值均值来观察其贡献 weights = model.layer1.weight.data.cpu().numpy() feature_importance = np.mean(np.abs(weights), axis=0) # 特征排序 sorted_idx = np.argsort(feature_importance)[-10:] # 取前10个 plt.figure(figsize=(10, 6)) plt.barh(range(10), feature_importance[sorted_idx]) plt.yticks(range(10), feature_names[sorted_idx]) plt.xlabel('Mean Absolute Weight') plt.title('Top 10 Feature Importance (Layer 1 Weights)') plt.savefig('feature_importance.png') print("Saved feature_importance.png") # --- 主程序执行 --- if __name__ == "__main__": # 加载数据 data_path = 'e:\\桌面\\Python60DaysChallenge-main\\data.csv' X_train, y_train, X_val, y_val, X_test, y_test, feature_names = load_and_preprocess_data(data_path) # 创建 DataLoader batch_size = 64 train_dataset = CreditDataset(X_train, y_train) val_dataset = CreditDataset(X_val, y_val) test_dataset = CreditDataset(X_test, y_test) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=batch_size) test_loader = DataLoader(test_dataset, batch_size=batch_size) # 初始化模型 input_dim = X_train.shape[1] model = CreditNN(input_dim) print(model) # 损失函数和优化器 criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters(), lr=0.001) # 训练 train_losses, val_losses, train_accs, val_accs = train_model( model, train_loader, val_loader, criterion, optimizer, num_epochs=100, patience=10 ) # 绘制训练历史 plt.figure(figsize=(12, 5)) plt.subplot(1, 2, 1) plt.plot(train_losses, label='Train Loss') plt.plot(val_losses, label='Val Loss') plt.title('Loss Curve') plt.legend() plt.subplot(1, 2, 2) plt.plot(train_accs, label='Train Acc') plt.plot(val_accs, label='Val Acc') plt.title('Accuracy Curve') plt.legend() plt.savefig('training_history.png') print("Saved training_history.png") # 评估 evaluate_model(model, test_loader, feature_names)

结果如下:

版权声明: 本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:809451989@qq.com进行投诉反馈,一经查实,立即删除!
网站建设 2026/6/12 20:29:34

C++的第十四天笔记

存储持续性、作用域和链接性C使用三种&#xff08;C11四种&#xff09;不同方案存储数据。这些方案的区别在于数据保留在内存中的时间。自动存储持续性&#xff1a;在函数定义中声明的变量&#xff08;包括函数参数&#xff09;&#xff0c;程序执行所属函数 / 代码块时创建&am…

作者头像 李华
网站建设 2026/6/12 5:46:16

874-LangChain框架Use-Cases - 基于智能体的动态槽位填充系统 - 案例分析

1. 案例目标 本案例旨在构建一个基于智能体的动态槽位填充系统&#xff0c;实现智能对话系统&#xff0c;能够分析用户请求并自动收集必要信息&#xff0c;通过对话补充缺失信息。 系统主要实现以下目标&#xff1a; 实现动态槽位填充功能&#xff0c;自动识别并收集必要信息…

作者头像 李华
网站建设 2026/6/12 2:46:48

ops-nn算子库生态纵览 - 构建健壮的AI算力基石

目录 &#x1f3af; 摘要 1. ops-nn&#xff1a;CANN神经网络计算的中枢神经系统 1.1 &#x1f504; 算子库的定位与演进轨迹 1.2 &#x1f4ca; 矩阵计算&#xff1a;AI算力的本质洞察 2. NPU硬件架构&#xff1a;算子设计的物理基础 2.1 &#x1f527; AI Core微架构深…

作者头像 李华
网站建设 2026/6/11 10:05:06

基于Java Spring Boot的相机租赁系统的设计与实现-毕业设计源码50424

目录 摘 要 Abstract 第一章 绪 论 1.1 研究背景及意义 1.2 国内外研究现状 1.3 论文组织结构 第二章 关键技术 2.1 Java语言 2.2 MySQL 2.3 SpringBoot框架 2.4 B/S结构概述 第三章 相机租赁系统 系统分析 3.1 系统可行性分析 3.1.1 技术可行性 3.1.2 经济可行…

作者头像 李华
网站建设 2026/6/12 21:19:02

VMware替代 | 解析ZStack Cloud替代VCF基础架构底座路径

从2025年12月1日开始&#xff0c;VMware已经停止在中国销售VMware vSphere Foundation&#xff08;VVF&#xff09;VMware vSphere Enterprise Plus&#xff08;VVEP&#xff09;。这意味着&#xff0c;依赖VMware虚拟化的用户只能转向更昂贵的VMware Cloud Foundation&#xf…

作者头像 李华
网站建设 2026/6/12 23:51:40

[Powershell 入门教程]第10天习题解析

第10天习题解析有没&#xff1f;没有。这里会记录平时我使用Powershell的一些心得&#xff0c;也欢迎大家留言&#xff0c;合适的我会合并到正文中。

作者头像 李华