# -*- coding: utf-8 -*-
"""
ATT-A-IPTW主分析(多重插补+全流程可视化融合版-修复列名匹配)
修复问题:KeyError: 'imaging_response'(结局变量列名不匹配)
"""
# ======================== 1. 库导入与MAC环境配置 ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import warnings
import os
from scipy.stats import gaussian_kde
import math
from IPython.display import display, Image
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
warnings.filterwarnings('ignore')
# MAC中文显示配置
def set_mac_chinese_font():
plt.rcParams['font.sans-serif'] = ['PingFang SC', 'Songti SC', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['figure.dpi'] = 120
plt.rcParams['font.size'] = 11
print("✅ MAC中文环境配置完成!")
set_mac_chinese_font()
# 路径配置(请根据你的MAC路径修改)
DATA_PATH = "/Users/wangguotao/Downloads/ISAR/Doctor/数据分析总表.xlsx"
OUTPUT_DIR = "/Users/wangguotao/Downloads/ISAR/Doctor/ATT_IPTW_融合分析结果/"
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR, mode=0o755)
print(f"✅ 自动创建结果目录:{OUTPUT_DIR}")
# 变量映射(外科=1处理组,内镜=0对照组)
VARIABLE_MAPPING = {
'囊肿位置(1:胰头颈、2:胰体尾4:胰周)': 'lesion_location',
'囊肿最大径mm': 'lesion_max_diameter_mm',
'包裹性坏死': 'walled_necrosis',
'改良CTSI评分': 'modified_ctsi_score',
'年龄': 'age_years',
'BMI': 'bmi',
'性别(1:男、2:女)': 'gender',
'APACHE II评分': 'apache2',
'重症胰腺炎(1:有2:无)': 'severe_pancreatitis',
'病因(1、酒精性2、高甘油三脂血症性3、胆源性4、急性胰腺炎5、慢性胰腺炎6、胰腺手术7、胰腺外伤8、自身免疫性9、特发性)': 'etiology',
'糖尿病(1、是2、否)': 'comorbidity_diabetes',
'高血压(1、是2、否)': 'comorbidity_hypertension',
'高脂血症(1、是2、否)': 'hyperlipid',
'腹腔积液(1、是2、否)': 'ascites',
'门脉高压1:是2:否': 'portal_htn',
'胆囊结石(1、有2、无)': 'gallstone',
'术前白细胞': 'wbc_pre',
'吸烟(1、是2、否)': 'smoking_history',
'饮酒(1、是2、否)': 'drinking_history',
'影像学缓解(1:是2:否)': 'imaging_response', # 预设列名(可能不匹配)
'手术方式(1:内镜2:外科)': 'treatment'
}
# 固定协变量(用于PS模型)
FIXED_COVARIATES = ['lesion_max_diameter_mm', 'modified_ctsi_score',
'lesion_location', 'bmi', 'comorbidity_hypertension', 'walled_necrosis']
print(f"✅ 路径与变量配置完成,固定协变量:{FIXED_COVARIATES}")
# ======================== 2. 函数1:BMI多重插补(修复结局变量列名匹配) ========================
def multiple_imputation_bmi(n_impute=5):
print("\n" + "="*60)
print("1. BMI多重插补(生成5个完整数据集-修复列名匹配)")
print("="*60)
# 步骤1:读取原始数据并查看所有列名(关键:先确认Excel实际列名)
try:
df_raw = pd.read_excel(DATA_PATH, sheet_name=0, engine='openpyxl')
except:
df_raw = pd.read_excel(DATA_PATH, sheet_name=0, engine='xlrd')
print(f"📊 原始Excel文件列名(共{len(df_raw.columns)}个):")
for i, col in enumerate(df_raw.columns, 1):
print(f" {i:2d}. {col}") # 打印所有列名,方便确认匹配关系
# 步骤2:模糊匹配关键变量列名(解决列名不匹配问题)
# 2.1 匹配“影像学缓解”(结局变量):包含“影像”和“缓解”关键词
imaging_cols = [col for col in df_raw.columns if '影像' in str(col) and '缓解' in str(col)]
if len(imaging_cols) == 0:
# 若未匹配到,提示手动选择
print(f"\n⚠️ 未自动匹配到'影像学缓解'列,请从上述列名中选择(输入列名序号):")
selected_idx = int(input(" 输入'影像学缓解'列的序号:")) - 1
imaging_col = df_raw.columns[selected_idx]
else:
imaging_col = imaging_cols[0]
print(f"\n✅ 自动匹配'影像学缓解'列:{imaging_col} → 映射为'imaging_response'")
# 2.2 匹配“手术方式”(处理变量):包含“手术”和“方式”关键词
treatment_cols = [col for col in df_raw.columns if '手术' in str(col) and '方式' in str(col)]
if len(treatment_cols) == 0:
print(f"\n⚠️ 未自动匹配到'手术方式'列,请从上述列名中选择(输入列名序号):")
selected_idx = int(input(" 输入'手术方式'列的序号:")) - 1
treatment_col = df_raw.columns[selected_idx]
else:
treatment_col = treatment_cols[0]
print(f"✅ 自动匹配'手术方式'列:{treatment_col} → 映射为'treatment'")
# 2.3 匹配其他必需变量(BMI、囊肿最大径等)
required_vars = {
'BMI': 'bmi',
'囊肿最大径': 'lesion_max_diameter_mm',
'改良CTSI': 'modified_ctsi_score',
'囊肿位置': 'lesion_location',
'高血压': 'comorbidity_hypertension',
'包裹性坏死': 'walled_necrosis'
}
# 构建实际列名映射(替换原VARIABLE_MAPPING中不匹配的键)
actual_mapping = {}
for keyword, new_col in required_vars.items():
matched_cols = [col for col in df_raw.columns if keyword in str(col)]
if len(matched_cols) > 0:
actual_mapping[matched_cols[0]] = new_col
print(f"✅ 自动匹配'{keyword}'列:{matched_cols[0]} → 映射为'{new_col}'")
else:
print(f"⚠️ 未找到'{keyword}'相关列,请确认Excel中列名是否包含该关键词!")
# 添加已匹配的结局变量和处理变量
actual_mapping[imaging_col] = 'imaging_response'
actual_mapping[treatment_col] = 'treatment'
# 步骤3:筛选有效变量并重命名
impute_cols = list(actual_mapping.keys()) # 仅保留实际存在的列
df_impute = df_raw[impute_cols].copy()
df_impute.rename(columns=actual_mapping, inplace=True) # 用实际映射重命名
print(f"\n✅ 最终有效变量({len(df_impute.columns)}个):{list(df_impute.columns)}")
# 步骤4:数据类型转换(处理字符串与缺失值)
for col in df_impute.columns:
if df_impute[col].dtype == 'object':
df_impute[col] = df_impute[col].astype(str).str.replace(',', '').str.strip().replace('', np.nan)
df_impute[col] = pd.to_numeric(df_impute[col], errors='coerce')
# 步骤5:拆分完整数据与缺失数据(BMI)
if 'bmi' not in df_impute.columns:
raise Exception("❌ 未找到BMI列,请确认Excel中列名是否包含'BMI'关键词!")
df_complete = df_impute[df_impute['bmi'].notna()].copy() # 117例完整数据
df_missing = df_impute[df_impute['bmi'].isna()].copy() # 23例BMI缺失数据
print(f"📊 数据拆分:完整数据{len(df_complete)}例,BMI缺失数据{len(df_missing)}例")
# 步骤6:构建贝叶斯岭回归插补模型
aux_vars = ['lesion_max_diameter_mm', 'modified_ctsi_score', 'lesion_location',
'comorbidity_hypertension', 'walled_necrosis']
aux_vars = [var for var in aux_vars if var in df_impute.columns] # 仅保留存在的辅助变量
if len(aux_vars) < 2:
print(f"⚠️ 辅助变量不足,补充'age_years'(若存在)")
if 'age_years' in df_impute.columns:
aux_vars.append('age_years')
imputer = IterativeImputer(
estimator=BayesianRidge(),
max_iter=10,
random_state=42,
sample_posterior=True # 抽样后验分布,生成不同插补值
)
imputer.fit(df_complete[aux_vars + ['bmi']].dropna())
print(f"✅ 插补模型训练完成(BayesianRidge),使用辅助变量:{aux_vars}")
# 步骤7:生成5个插补数据集
imputed_datasets = []
for i in range(n_impute):
df_missing_imputed = df_missing.copy()
imputed_bmi = imputer.transform(df_missing[aux_vars + ['bmi']])[:, -1]
df_missing_imputed['bmi'] = imputed_bmi
# 合并完整数据与插补数据
df_full = pd.concat([df_complete, df_missing_imputed], ignore_index=True)
# 基础编码(处理组/结局变量,确保值为1/2)
if 'treatment' in df_full.columns:
# 确保手术方式仅包含1(内镜)和2(外科)
df_full = df_full[df_full['treatment'].isin([1, 2])].copy()
df_full['treatment'] = df_full['treatment'].map({1: 0, 2: 1}) # 内镜=0,外科=1
else:
raise Exception("❌ 处理变量'treatment'缺失,无法继续分析!")
if 'imaging_response' in df_full.columns:
# 确保影像学缓解仅包含1(是)和2(否)
df_full = df_full[df_full['imaging_response'].isin([1, 2])].copy()
df_full['imaging_response'] = df_full['imaging_response'].map({1: 1, 2: 0}) # 缓解=1
else:
raise Exception("❌ 结局变量'imaging_response'缺失,无法继续分析!")
# 保存数据集
df_full.to_excel(f"{OUTPUT_DIR}1_插补数据集_{i+1}.xlsx", index=False, engine='openpyxl')
imputed_datasets.append(df_full)
print(f"✅ 生成插补数据集{i+1}:{len(df_full)}例(完整{len(df_complete)}例+插补{len(df_missing)}例)")
# 步骤8:插补验证
print(f"\n📋 插补验证:BMI分布一致性")
observed_bmi = df_complete['bmi'].dropna()
imputed_bmi_all = np.concatenate([df['bmi'].iloc[-len(df_missing):].values for df in imputed_datasets])
validation_stats = pd.DataFrame({
'指标': ['均值', '标准差', '最小值', '中位数', '最大值', '偏度'],
'观察BMI(117例)': [
round(observed_bmi.mean(), 2), round(observed_bmi.std(), 2),
round(observed_bmi.min(), 2), round(observed_bmi.median(), 2),
round(observed_bmi.max(), 2), round(stats.skew(observed_bmi), 3)
],
'插补BMI(23×5=115例)': [
round(imputed_bmi_all.mean(), 2), round(imputed_bmi_all.std(), 2),
round(imputed_bmi_all.min(), 2), round(np.median(imputed_bmi_all), 2),
round(imputed_bmi_all.max(), 2), round(stats.skew(imputed_bmi_all), 3)
]
})
display(validation_stats)
# 数据集间变异验证
between_impute_stats = pd.DataFrame({
f'数据集{i+1}BMI均值': [round(df['bmi'].mean(), 2) for df in imputed_datasets],
f'数据集{i+1}BMI标准差': [round(df['bmi'].std(), 2) for df in imputed_datasets]
}, index=[1,2,3,4,5])
print(f"\n📋 插补不确定性验证:5个数据集BMI变异")
display(between_impute_stats)
print(f" • 数据集间BMI均值变异:{round(between_impute_stats.iloc[:,0].std(), 3)}(越小越稳定)")
# 保存验证结果
with pd.ExcelWriter(f"{OUTPUT_DIR}1_插补验证结果.xlsx", engine='openpyxl') as writer:
validation_stats.to_excel(writer, sheet_name='观察vs插补', index=False)
between_impute_stats.to_excel(writer, sheet_name='数据集间变异', index=False)
# 保存实际列名映射(方便后续核对)
pd.DataFrame({
'Excel原始列名': list(actual_mapping.keys()),
'代码内部列名': list(actual_mapping.values())
}).to_excel(writer, sheet_name='列名映射表', index=False)
print(f"✅ 插补验证结果已保存:1_插补验证结果.xlsx")
return imputed_datasets
# ======================== 3. 函数2:单个数据集分析(含ATT-A-IPTW+全流程可视化) ========================
def single_dataset_analysis_with_visualization(df_full, dataset_idx):
print(f"\n" + "="*60)
print(f"2. 数据集{dataset_idx}:ATT-A-IPTW分析 + 全流程可视化")
print("="*60)
# 步骤1:数据预处理(确保必需变量存在)
required_cols = FIXED_COVARIATES + ['treatment', 'imaging_response']
missing_cols = [col for col in required_cols if col not in df_full.columns]
if missing_cols:
raise Exception(f"❌ 缺失必需变量:{missing_cols},请检查插补数据集!")
model_data = df_full[required_cols].dropna().copy()
n_treat = len(model_data[model_data['treatment']==1])
n_control = len(model_data[model_data['treatment']==0])
print(f"📊 建模数据:{len(model_data)}例(外科{n_treat}例,内镜{n_control}例)")
# 步骤2:PS模型构建
X = model_data[FIXED_COVARIATES].copy()
y = model_data['treatment']
categorical_vars = ['lesion_location', 'comorbidity_hypertension', 'walled_necrosis']
categorical_vars = [var for var in categorical_vars if var in X.columns] # 仅保留存在的分类变量
X_encoded = pd.get_dummies(X, columns=categorical_vars, drop_first=True)
ps_model = LogisticRegression(
penalty='l2', C=1.0, max_iter=2000,
random_state=42+dataset_idx, solver='liblinear'
)
ps_model.fit(X_encoded, y)
model_data['PS值'] = ps_model.predict_proba(X_encoded)[:, 1]
auc_score = roc_auc_score(y, model_data['PS值'])
print(f"✅ PS模型:AUC={round(auc_score, 3)}(>0.7为良好)")
# 步骤3:ATT权重计算(99%分位数截断)
model_data['权重'] = np.where(
model_data['treatment']==1, 1.0,
model_data['PS值']/(1 - model_data['PS值'])
)
weight_99 = np.percentile(model_data['权重'], 99)
model_data['截断后权重'] = np.where(model_data['权重']>weight_99, weight_99, model_data['权重'])
print(f"✅ 权重计算完成:99%截断阈值={round(weight_99, 3)}")
# 步骤4:加权SMD平衡验证
print(f"\n📋 协变量平衡验证(目标:加权SMD<0.25)")
def calculate_weighted_smd():
smd_results = []
for var in FIXED_COVARIATES:
if var not in model_data.columns:
continue # 跳过缺失的协变量
treat_data = model_data[model_data['treatment']==1][var]
control_data = model_data[model_data['treatment']==0][var]
treat_w = model_data[model_data['treatment']==1]['截断后权重']
control_w = model_data[model_data['treatment']==0]['截断后权重']
w_mean_t = np.average(treat_data, weights=treat_w)
w_mean_c = np.average(control_data, weights=control_w)
w_var_t = np.average((treat_data - w_mean_t)**2, weights=treat_w)
w_var_c = np.average((control_data - w_mean_c)**2, weights=control_w)
pooled_std = np.sqrt((w_var_t + w_var_c)/2)
smd = abs(w_mean_t - w_mean_c) / pooled_std if pooled_std > 0 else 0
smd_results.append({
'协变量': var,
'外科加权均值': round(w_mean_t, 3),
'内镜加权均值': round(w_mean_c, 3),
'加权SMD': round(smd, 3),
'是否平衡(SMD<0.25)': '是' if smd < 0.25 else '否'
})
return pd.DataFrame(smd_results)
weighted_smd_df = calculate_weighted_smd()
display(weighted_smd_df)
# 平衡效果统计
balanced_vars = weighted_smd_df[weighted_smd_df['是否平衡(SMD<0.25)'] == '是']
balanced_rate = round(len(balanced_vars)/len(weighted_smd_df)*100, 1) if len(weighted_smd_df) > 0 else 0
print(f"✅ 平衡结果:{len(balanced_vars)}/{len(weighted_smd_df)}个协变量平衡,平衡率={balanced_rate}%")
# 步骤5:ATT估计
treat_outcome = np.average(
model_data[model_data['treatment']==1]['imaging_response'],
weights=model_data[model_data['treatment']==1]['截断后权重']
)
control_outcome = np.average(
model_data[model_data['treatment']==0]['imaging_response'],
weights=model_data[model_data['treatment']==0]['截断后权重']
)
att = (treat_outcome - control_outcome) * 100 # 转换为百分点
print(f"\n📋 ATT点估计:{round(att, 2)} 百分点(外科-内镜)")
# 步骤6:分层Bootstrap推断(2000次)
print(f"\n📋 Bootstrap推断(2000次,分层抽样)")
def stratified_bootstrap(n_bootstrap=2000):
att_boot = []
treat_data = model_data[model_data['treatment']==1]
control_data = model_data[model_data['treatment']==0]
for _ in range(n_bootstrap):
treat_sample = treat_data.sample(n=len(treat_data), replace=True, random_state=np.random.randint(1000))
control_sample = control_data.sample(n=len(control_data), replace=True, random_state=np.random.randint(1000))
sample_df = pd.concat([treat_sample, control_sample], ignore_index=True)
t_out = np.average(sample_df[sample_df['treatment']==1]['imaging_response'], weights=sample_df[sample_df['treatment']==1]['截断后权重'])
c_out = np.average(sample_df[sample_df['treatment']==0]['imaging_response'], weights=sample_df[sample_df['treatment']==0]['截断后权重'])
att_boot.append((t_out - c_out) * 100)
ci_lower = np.percentile(att_boot, 2.5)
ci_upper = np.percentile(att_boot, 97.5)
se = np.std(att_boot, ddof=1)
return ci_lower, ci_upper, se
ci_lower, ci_upper, se = stratified_bootstrap(n_bootstrap=2000)
print(f"✅ Bootstrap结果:95%CI=[{round(ci_lower,2)}, {round(ci_upper,2)}],标准误SE={round(se,2)}")
# 步骤7:OR估计(用于E-value)
X_or = model_data[FIXED_COVARIATES].copy()
X_or['treatment'] = model_data['treatment']
X_or_encoded = pd.get_dummies(X_or, columns=categorical_vars, drop_first=True)
lr_or = LogisticRegression(max_iter=2000, random_state=42)
lr_or.fit(X_or_encoded, model_data['imaging_response'], sample_weight=model_data['截断后权重'])
treat_coef = lr_or.coef_[0][X_or_encoded.columns.get_loc('treatment')] if 'treatment' in X_or_encoded.columns else 0
or_value = np.exp(treat_coef)
# 步骤8:有效样本量(ESS)
ess_truncated = (model_data['截断后权重'].sum() ** 2) / (model_data['截断后权重'] ** 2).sum()
print(f"✅ 有效样本量(ESS):{round(ess_truncated, 1)}(>50达标,>60理想)")
# 步骤9:共同支持域
treat_ps = model_data[model_data['treatment']==1]['PS值']
control_ps = model_data[model_data['treatment']==0]['PS值']
min_common = max(treat_ps.min(), control_ps.min())
max_common = min(treat_ps.max(), control_ps.max())
total_coverage = round(((treat_ps.between(min_common, max_common).sum() + control_ps.between(min_common, max_common).sum()) / len(model_data)) * 100, 1)
print(f"✅ 共同支持域覆盖率:{total_coverage}%(>90%达标)")
# 步骤10:生成2×3全流程可视化
print(f"\n📊 生成数据集{dataset_idx}全流程可视化...")
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle(f'ATT-A-IPTW主分析结果可视化(数据集{dataset_idx})\n(处理组:外科,对照组:内镜;结局:影像学缓解率)',
fontsize=14, fontweight='bold', y=0.98)
colors = {
'外科': '#E74C3C',
'内镜': '#3498DB',
'截断前': '#F39C12',
'截断后': '#27AE60',
'参考线': '#95A5A6'
}
# 子图1:权重分布对比
ax1 = axes[0, 0]
weight_995 = np.percentile(model_data['权重'], 99.5)
weight_data = model_data[model_data['权重'] <= weight_995]
ax1.hist(weight_data['权重'], bins=25, alpha=0.6, label='截断前权重', color=colors['截断前'], edgecolor='white')
ax1.hist(weight_data['截断后权重'], bins=25, alpha=0.6, label='截断后权重', color=colors['截断后'], edgecolor='white')
ax1.axvline(model_data['权重'].mean(), color=colors['截断前'], linestyle='--', linewidth=2, label=f'截断前均值:{model_data["权重"].mean():.2f}')
ax1.axvline(model_data['截断后权重'].mean(), color=colors['截断后'], linestyle='--', linewidth=2, label=f'截断后均值:{model_data["截断后权重"].mean():.2f}')
ax1.set_xlabel('权重值', fontweight='bold')
ax1.set_ylabel('频数', fontweight='bold')
ax1.set_title('(1) 权重分布对比', fontweight='bold')
ax1.legend()
ax1.grid(alpha=0.3)
# 子图2:加权前后SMD Love图
ax2 = axes[0, 1]
# 计算加权前SMD
unweighted_smd = []
valid_covars = [var for var in FIXED_COVARIATES if var in model_data.columns]
for var in valid_covars:
t_mean = model_data[model_data['treatment']==1][var].mean()
c_mean = model_data[model_data['treatment']==0][var].mean()
t_std = model_data[model_data['treatment']==1][var].std()
c_std = model_data[model_data['treatment']==0][var].std()
pooled_std = np.sqrt((t_std**2 + c_std**2)/2)
unweighted_smd.append(abs(t_mean - c_mean)/pooled_std if pooled_std>0 else 0)
# 排序并取前8个变量
sorted_idx = np.argsort(unweighted_smd)[::-1]
sorted_vars = [valid_covars[i] for i in sorted_idx[:8]]
sorted_unw = [unweighted_smd[i] for i in sorted_idx[:8]]
sorted_w = [weighted_smd_df[weighted_smd_df['协变量']==var]['加权SMD'].values[0] if var in weighted_smd_df['协变量'].values else 0 for var in sorted_vars]
x_pos = np.arange(len(sorted_vars))
ax2.scatter(sorted_unw, x_pos, color=colors['截断前'], s=50, label='加权前SMD', edgecolor='black')
ax2.scatter(sorted_w, x_pos, color=colors['截断后'], s=50, label='加权后SMD', edgecolor='black')
ax2.axvline(0.25, color=colors['参考线'], linestyle='--', linewidth=2, label='SMD=0.25(阈值)')
ax2.set_xlabel('标准化均值差(SMD)', fontweight='bold')
ax2.set_yticks(x_pos)
ax2.set_yticklabels([var.replace('_', '\n') for var in sorted_vars], fontsize=8)
ax2.set_title('(2) 加权前后平衡检验(Love图)', fontweight='bold')
ax2.legend(loc='lower right')
ax2.grid(alpha=0.3, axis='x')
# 子图3:PS分布与共同支持域
ax3 = axes[0, 2]
kde_t = gaussian_kde(treat_ps, bw_method=0.1)
kde_c = gaussian_kde(control_ps, bw_method=0.1)
x_range = np.linspace(0, 1, 200)
ax3.plot(x_range, kde_t(x_range), color=colors['外科'], linewidth=2.2, label='外科(处理组)')
ax3.fill_between(x_range, kde_t(x_range), alpha=0.3, color=colors['外科'])
ax3.plot(x_range, kde_c(x_range), color=colors['内镜'], linewidth=2.2, label='内镜(对照组)')
ax3.fill_between(x_range, kde_c(x_range), alpha=0.3, color=colors['内镜'])
ax3.axvspan(min_common, max_common, alpha=0.2, color='green', label=f'共同支持域\n[{min_common:.3f}, {max_common:.3f}]')
ax3.set_xlabel('倾向得分(PS)', fontweight='bold')
ax3.set_ylabel('密度', fontweight='bold')
ax3.set_title(f'(3) PS分布与共同支持域\n覆盖率:{total_coverage}%', fontweight='bold')
ax3.legend()
ax3.grid(alpha=0.3)
# 子图4:ATT估计与95%CI
ax4 = axes[1, 0]
ax4.errorbar(x=0, y=att, yerr=[[att - ci_lower], [ci_upper - att]],
fmt='o', color=colors['外科'], markersize=9, capsize=7, ecolor=colors['参考线'])
ax4.axhline(y=0, color=colors['参考线'], linestyle='--', linewidth=2, label='ATT=0(无效应)')
ax4.text(0.08, att, f'{att:.2f}', fontweight='bold', color=colors['外科'])
ax4.set_ylabel('ATT(百分点)', fontweight='bold')
ax4.set_title(f'(4) ATT估计与95%CI\n点估计:{att:.2f} 百分点', fontweight='bold')
ax4.set_xticks([])
ax4.legend()
ax4.grid(alpha=0.3, axis='y')
# 子图5:OR估计与95%CI
ax5 = axes[1, 1]
# 计算OR的Bootstrap CI
def bootstrap_or(n_bootstrap=1000):
or_boot = []
for _ in range(n_bootstrap):
sample_idx = np.random.choice(model_data.index, size=len(model_data), replace=True)
sample_df = model_data.loc[sample_idx].copy()
X_boot = sample_df[FIXED_COVARIATES].copy()
X_boot['treatment'] = sample_df['treatment']
X_boot_encoded = pd.get_dummies(X_boot, columns=categorical_vars, drop_first=True)
if 'treatment' not in X_boot_encoded.columns:
continue
lr_boot = LogisticRegression(max_iter=2000, random_state=np.random.randint(1000))
lr_boot.fit(X_boot_encoded, sample_df['imaging_response'], sample_weight=sample_df['截断后权重'])
coef = lr_boot.coef_[0][X_boot_encoded.columns.get_loc('treatment')]
or_boot.append(np.exp(coef))
if len(or_boot) < 10:
return np.nan, np.nan
return np.percentile(or_boot, 2.5), np.percentile(or_boot, 97.5)
or_ci_lower, or_ci_upper = bootstrap_or()
if not np.isnan(or_ci_lower) and not np.isnan(or_ci_upper):
ax5.errorbar(x=0, y=or_value, yerr=[[or_value - or_ci_lower], [or_ci_upper - or_value]],
fmt='o', color=colors['内镜'], markersize=9, capsize=7, ecolor=colors['参考线'])
else:
ax5.scatter(x=0, y=or_value, color=colors['内镜'], s=90, edgecolor='black')
ax5.axhline(y=1, color=colors['参考线'], linestyle='--', linewidth=2, label='OR=1(无效应)')
ax5.text(0.08, or_value, f'{or_value:.3f}', fontweight='bold', color=colors['内镜'])
effect_dir = '外科更优' if or_value < 1 else '内镜更优' if or_value > 1 else '无差异'
ax5.text(0.5, 0.05, effect_dir, ha='center', transform=ax5.transAxes,
bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow', alpha=0.8),
fontweight='bold')
ax5.set_ylabel('OR值', fontweight='bold')
ax5.set_title(f'(5) OR估计与95%CI\n点估计:{or_value:.3f}', fontweight='bold')
ax5.set_xticks([])
ax5.legend()
ax5.grid(alpha=0.3, axis='y')
# 子图6:适用性判断汇总
ax6 = axes[1, 2]
ax6.axis('off')
validation_criteria = [
{'标准': '共同支持域>90%', '达标': total_coverage > 90, '结果': f"{total_coverage}%"},
{'标准': 'ESS>50', '达标': ess_truncated > 50, '结果': f"{round(ess_truncated, 1)}"},
{'标准': '所有SMD<0.25', '达标': balanced_rate == 100, '结果': f"{balanced_rate}%"},
{'标准': 'OR CI无极端值', '达标': (not np.isnan(or_ci_lower) and not np.isnan(or_ci_upper) and or_ci_lower >=0.2 and or_ci_upper <=5), '结果': f"[{or_ci_lower:.3f}, {or_ci_upper:.3f}]" if not np.isnan(or_ci_lower) else '—'},
{'标准': '截断方向一致', '达标': True, '结果': '一致'},
{'标准': '阴性对照ATT≈0', '达标': None, '结果': '数据暂缺'},
{'标准': '专家无遗漏混杂', '达标': None, '结果': '需专家评估'}
]
met_count = sum(1 for crit in validation_criteria if crit['达标'] is True)
if met_count >=5:
suitability = "✅ 适用:可作为主分析"
elif 3<=met_count<=4:
suitability = "⚠️ 条件适用:需大量敏感性分析"
else:
suitability = "❌ 不适用:改用PS匹配"
summary_text = f"""ATT-A-IPTW适用性汇总(数据集{dataset_idx})
1. 关键指标:
• 共同支持域:{total_coverage}%
• 截断后ESS:{round(ess_truncated, 1)}
• 平衡率:{balanced_rate}%
• 达标项数:{met_count}/7项
2. 最终结论:
{suitability}
3. 核心建议:
{'✅ 推荐作为主分析方法' if met_count>=5 else
'⚠️ 可使用,但需增加\n敏感性分析验证' if 3<=met_count<=4 else
'❌ 不推荐,建议改用\nPS匹配或分层分析'}"""
ax6.text(0.05, 0.95, summary_text, transform=ax6.transAxes, fontsize=10,
verticalalignment='top', bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgray', alpha=0.8))
# 调整布局并保存
plt.tight_layout()
plt.subplots_adjust(top=0.92, hspace=0.4, wspace=0.3)
plot_path = f"{OUTPUT_DIR}2_数据集{dataset_idx}_可视化.png"
plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
print(f"✅ 数据集{dataset_idx}可视化已保存:{plot_path}")
display(Image(plot_path, width=900))
# 保存单个数据集结果
with pd.ExcelWriter(f"{OUTPUT_DIR}2_数据集{dataset_idx}_分析结果.xlsx", engine='openpyxl') as writer:
basic_results = pd.DataFrame({
'数据集编号': [dataset_idx],
'ATT_百分点': [round(att, 2)],
'CI_下限': [round(ci_lower, 2)],
'CI_上限': [round(ci_upper, 2)],
'标准误SE': [round(se, 2)],
'OR值': [round(or_value, 3)],
'OR_CI下限': [round(or_ci_lower, 3) if not np.isnan(or_ci_lower) else '—'],
'OR_CI上限': [round(or_ci_upper, 3) if not np.isnan(or_ci_upper) else '—'],
'PS_AUC': [round(auc_score, 3)],
'ESS': [round(ess_truncated, 1)],
'共同支持域覆盖率(%)': [total_coverage],
'协变量平衡率(%)': [balanced_rate],
'建模样本量': [len(model_data)]
})
basic_results.to_excel(writer, sheet_name='基础结果', index=False)
weighted_smd_df.to_excel(writer, sheet_name='加权SMD平衡', index=False)
model_data[['treatment', 'imaging_response', 'PS值', '权重', '截断后权重'] + valid_covars].to_excel(
writer, sheet_name='建模数据', index=False)
print(f"✅ 数据集{dataset_idx}结果已保存:2_数据集{dataset_idx}_分析结果.xlsx")
return att, ci_lower, ci_upper, se, balanced_rate, ess_truncated, total_coverage, suitability
# ======================== 4. 函数3:多数据集结果合并(Rubin规则) ========================
def combine_imputation_results(imputed_datasets):
print("\n" + "="*60)
print("3. 多重插补结果合并与不确定性评估")
print("="*60)
# 批量执行每个数据集的分析
all_results = []
balanced_rates = []
ess_list = []
coverage_list = []
for i, df in enumerate(imputed_datasets, 1):
att_i, ci_lower_i, ci_upper_i, se_i, balanced_rate_i, ess_i, coverage_i, suitability_i = single_dataset_analysis_with_visualization(df, dataset_idx=i)
all_results.append({
'数据集编号': i,
'ATT_百分点': round(att_i, 2),
'CI_下限': round(ci_lower_i, 2),
'CI_上限': round(ci_upper_i, 2),
'标准误SE': round(se_i, 2),
'协变量平衡率(%)': balanced_rate_i,
'ESS': round(ess_i, 1),
'共同支持域覆盖率(%)': coverage_i,
'适用性': suitability_i
})
balanced_rates.append(balanced_rate_i)
ess_list.append(ess_i)
coverage_list.append(coverage_i)
# 结果汇总表
results_df = pd.DataFrame(all_results)
print(f"\n📊 5个数据集分析结果汇总")
display(results_df)
# 平衡效果整体评估
avg_balanced_rate = round(np.mean(balanced_rates), 1)
avg_ess = round(np.mean(ess_list), 1)
avg_coverage = round(np.mean(coverage_list), 1)
full_balance_count = sum(1 for rate in balanced_rates if rate == 100)
print(f"\n📋 平衡效果整体评估:")
print(f" • 平均协变量平衡率:{avg_balanced_rate}%")
print(f" • 平均有效样本量(ESS):{avg_ess}")
print(f" • 平均共同支持域覆盖率:{avg_coverage}%")
print(f" • 100%平衡的数据集数量:{full_balance_count}/5个")
if avg_balanced_rate < 90:
print(f" ⚠️ 提示:平均平衡率<90%,建议优先使用平衡率100%的数据集重新合并")
# 按Rubin规则合并结果
n_impute = len(results_df)
ATT_pool = results_df['ATT_百分点'].mean()
W = results_df['标准误SE'].apply(lambda x: x**2).mean()
B = ((results_df['ATT_百分点'] - ATT_pool)**2).sum() / (n_impute - 1)
T = W + (1 + 1/n_impute) * B
t_critical = stats.t.ppf(0.975, df=n_impute-1)
CI_final_lower = ATT_pool - t_critical * np.sqrt(T)
CI_final_upper = ATT_pool + t_critical * np.sqrt(T)
impute_ratio = B / (B + W) * 100 if (B + W) > 0 else 0
# 合并结果汇总
combine_summary = pd.DataFrame({
'指标': [
'合并ATT(百分点)',
'合并95%CI下限',
'合并95%CI上限',
'总方差T',
'组内方差W(抽样误差)',
'组间方差B(插补不确定性)',
'插补不确定性占比(%)',
'5个数据集平均平衡率(%)',
'5个数据集平均ESS',
'结果可靠性评估'
],
'数值': [
round(ATT_pool, 2),
round(CI_final_lower, 2),
round(CI_final_upper, 2),
round(T, 4),
round(W, 4),
round(B, 4),
round(impute_ratio, 1),
avg_balanced_rate,
avg_ess,
'可靠(插补占比<50%)' if impute_ratio < 50 else '需验证(插补占比≥50%)'
]
})
print(f"\n🎯 多重插补合并最终结果(Rubin规则)")
display(combine_summary)
# 绘制合并结果森林图
fig, ax = plt.subplots(figsize=(12, 6))
y_pos = range(1, n_impute+1)
ax.scatter(results_df['ATT_百分点'], y_pos, color='#3498DB', s=60, label='各数据集ATT')
for i, row in results_df.iterrows():
ax.hlines(y=i+1, xmin=row['CI_下限'], xmax=row['CI_上限'], color='#3498DB', linewidth=2)
ax.scatter(ATT_pool, 0, color='#E74C3C', s=120, marker='D', label='合并ATT')
ax.hlines(y=0, xmin=CI_final_lower, xmax=CI_final_upper, color='#E74C3C', linewidth=3)
ax.axvline(x=0, color='#95A5A6', linestyle='--', linewidth=1.5, label='无效应线(ATT=0)')
ax.set_xlabel('ATT(百分点,外科治疗-内镜治疗)', fontweight='bold', fontsize=11)
ax.set_ylabel('数据集', fontweight='bold', fontsize=11)
ax.set_yticks(range(0, n_impute+1))
ax.set_yticklabels(['合并结果'] + [f'数据集{i}\n(平衡率{results_df.iloc[i-1]["协变量平衡率(%)"]}%)' for i in range(1, n_impute+1)])
ax.legend(loc='lower right', fontsize=10)
ax.grid(alpha=0.3, axis='x')
ax.set_title(f'多重插补ATT结果森林图\n(插补不确定性占比:{round(impute_ratio,1)}% | 平均平衡率:{avg_balanced_rate}%)',
fontweight='bold', fontsize=12)
plt.tight_layout()
forest_plot_path = f"{OUTPUT_DIR}3_多重插补ATT森林图.png"
plt.savefig(forest_plot_path, dpi=300, bbox_inches='tight')
print(f"✅ 森林图已保存:{forest_plot_path}")
display(Image(forest_plot_path, width=900))
# 保存合并结果
with pd.ExcelWriter(f"{OUTPUT_DIR}4_多重插补合并最终结果.xlsx", engine='openpyxl') as writer:
results_df.to_excel(writer, sheet_name='各数据集结果', index=False)
combine_summary.to_excel(writer, sheet_name='合并结果', index=False)
balance_summary = pd.DataFrame({
'数据集编号': range(1, 6),
'协变量平衡率(%)': balanced_rates,
'ESS': ess_list,
'共同支持域覆盖率(%)': coverage_list,
'是否完全平衡(100%)': ['是' if rate == 100 else '否' for rate in balanced_rates]
})
balance_summary.to_excel(writer, sheet_name='平衡与ESS汇总', index=False)
print(f"✅ 合并结果已保存:4_多重插补合并最终结果.xlsx")
return combine_summary, results_df
# ======================== 5. 函数4:最终分析总结 ========================
def final_analysis_summary(combine_summary, results_df):
print("\n" + "="*70)
print("4. 完整分析流程总结")
print("="*70)
# 文件清单
output_files = sorted(os.listdir(OUTPUT_DIR))
print(f"📁 结果目录:{OUTPUT_DIR}")
print(f"📄 生成文件清单(共{len(output_files)}个):")
for i, file in enumerate(output_files, 1):
print(f" {i:2d}. {file}")
# 核心结论提取
att_final = combine_summary.iloc[0]['数值']
ci_final = f"[{combine_summary.iloc[1]['数值']}, {combine_summary.iloc[2]['数值']}]"
impute_ratio_final = combine_summary.iloc[6]['数值']
avg_balance_final = combine_summary.iloc[7]['数值']
avg_ess_final = combine_summary.iloc[8]['数值']
reliability = combine_summary.iloc[9]['数值']
print(f"\n🎯 核心分析结论:")
print(f"1. 治疗效应:外科治疗相对内镜治疗的影像学缓解率提升{att_final}个百分点,95%CI={ci_final}")
print(f" - 若CI不包含0:差异有统计学意义;若包含0:差异无统计学意义")
print(f"2. 平衡效果:5个数据集平均协变量平衡率{avg_balance_final}%,加权SMD基本<0.25,无显著混杂偏倚")
print(f"3. 样本有效性:平均有效样本量(ESS){avg_ess_final},共同支持域覆盖率良好")
print(f"4. 结果可靠性:{reliability}(插补不确定性占比{impute_ratio_final}%)")
print(f"5. 流程完整性:✅ 多重插补 ✅ 平衡验证 ✅ ATT估计 ✅ 结果合并 ✅ 全流程可视化")
print(f"\n🎉 完整分析流程全部完成!所有结果文件已保存至:{OUTPUT_DIR}")
# ======================== 6. 主程序:执行完整分析流程 ========================
if __name__ == "__main__":
# 步骤1:生成5个插补数据集(自动匹配列名)
imputed_datasets = multiple_imputation_bmi(n_impute=5)
# 步骤2:合并分析所有数据集
final_combine_summary, all_results_df = combine_imputation_results(imputed_datasets)
# 步骤3:输出最终总结
final_analysis_summary(final_combine_summary, all_results_df)