Spearman检验组间相关性及SCI风格绘图
1. Spearman 秩相关系数
Spearman 秩相关系数用于衡量两组变量之间的单调相关性,适用于非线性(但单调)的关系或非正态分布的数据。与 Pearson 相关系数的区别:不要求变量之间呈线性关系,也不要求数据服从正态分布,只依据数据的秩次来评估单调关系。
2. 数据模拟+Spearman检验+SCI相关性绘图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib.patches import Rectangle# 设置字体
#plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# Use the ASCII hyphen for negative tick labels so the minus sign still
# renders when a font without the Unicode minus glyph (e.g. a CJK font) is used.
plt.rcParams["axes.unicode_minus"] = False

# Column names of the simulated data frame (runtime keys, kept in Chinese).
_SAMPLE_COL = '样本ID'
_TUMOR_COL = '肿瘤组织突变数'
_NORMAL_COL = '正常组织突变数'

# Significance threshold shared by the test and the plots.
_ALPHA = 0.05


def generate_mutation_data(n_samples=20, random_seed=42):
    """Simulate paired mutation counts for tumor and normal tissue.

    Each sample gets a gamma-distributed baseline. The tumor count adds a
    second gamma term plus Gaussian noise; the normal count is 0.6 times the
    baseline plus Gaussian noise. Both counts are clamped to a minimum of 1
    and truncated to integers.

    Args:
        n_samples: Number of sample pairs to generate.
        random_seed: Seed for the random generator, for reproducibility.

    Returns:
        DataFrame with the sample ID, tumor mutation count and normal
        mutation count columns.
    """
    # A private RandomState produces the same legacy stream as
    # np.random.seed(random_seed) followed by the module-level functions,
    # but does not reset the caller's global random state.
    rng = np.random.RandomState(random_seed)

    # Per-sample baseline (inter-individual variation).
    base_mutations = rng.gamma(shape=5, scale=10, size=n_samples)

    # Draw order matters for reproducibility: gamma first, then normal.
    tumor_mutations = (
        base_mutations
        + rng.gamma(shape=3, scale=5, size=n_samples)
        + rng.normal(0, 5, n_samples)
    )
    normal_mutations = base_mutations * 0.6 + rng.normal(0, 8, n_samples)

    # Noise can push values below 1; clamp so every count is at least 1.
    tumor_mutations = np.maximum(1, tumor_mutations).astype(int)
    normal_mutations = np.maximum(1, normal_mutations).astype(int)

    return pd.DataFrame({
        _SAMPLE_COL: [f'S{i + 1}' for i in range(n_samples)],
        _TUMOR_COL: tumor_mutations,
        _NORMAL_COL: normal_mutations,
    })


def calculate_spearman_correlation(data):
    """Compute the Spearman rank correlation between tumor and normal counts.

    Args:
        data: DataFrame containing the tumor and normal mutation count columns.

    Returns:
        Tuple ``(correlation, p_value, significant)`` where ``significant`` is
        True when ``p_value < 0.05``. A NaN p-value compares as not
        significant.
    """
    correlation, p_value = stats.spearmanr(data[_TUMOR_COL], data[_NORMAL_COL])
    # Convert numpy.bool_ to a plain bool for callers.
    significant = bool(p_value < _ALPHA)
    return correlation, p_value, significant


def _significance_stars(p_value):
    """Map a p-value to the conventional star notation ('ns' if >= 0.05 or NaN)."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return 'ns'


def plot_correlation(data, correlation, p_value, significant):
    """Draw a publication-style scatter plot with the correlation results.

    The figure shows the scatter of tumor vs. normal mutation counts, a
    regression line with a 95% confidence band, a box with rho / p / n in the
    top-left corner and a star-notation significance marker in the
    bottom-right corner.

    Args:
        data: DataFrame containing the tumor and normal mutation count columns.
        correlation: Spearman correlation coefficient.
        p_value: P-value of the correlation test.
        significant: Whether the correlation is significant.

    Returns:
        The matplotlib Figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(7, 6))

    sns.scatterplot(
        x=_TUMOR_COL,
        y=_NORMAL_COL,
        data=data,
        s=40,
        alpha=0.8,
        color='#0072B2',
        edgecolor='black',
        linewidth=0.5,
        ax=ax,
    )

    # Regression line with 95% confidence interval.
    sns.regplot(
        x=_TUMOR_COL,
        y=_NORMAL_COL,
        data=data,
        scatter=False,
        line_kws={'color': '#D55E00', 'lw': 2.0},
        ci=95,
        ax=ax,
    )

    ax.set_xlabel('Somatic mutations in tumor tissue', fontsize=14, fontweight='bold')
    ax.set_ylabel('Somatic mutations in normal tissue', fontsize=14, fontweight='bold')

    annotation = (
        f"Spearman's ρ = {correlation:.3f}\n"
        f'p = {p_value:.3e}\n'
        f'n = {len(data)} samples'
    )
    ax.annotate(
        annotation,
        xy=(0.03, 0.97),
        xycoords='axes fraction',
        fontsize=12,
        ha='left',
        va='top',
        bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='gray', alpha=0.8),
    )

    ax.annotate(
        _significance_stars(p_value),
        xy=(0.95, 0.05),
        xycoords='axes fraction',
        fontsize=20,
        ha='center',
        va='center',
        color='#D55E00' if significant else 'gray',
    )

    ax.grid(True, linestyle='--', alpha=0.3, color='gray')

    # Style ticks and frame on this Axes only; setting plt.rcParams here would
    # leak the thicker lines into every figure created afterwards.
    ax.tick_params(axis='both', which='major', width=1.5, length=5, labelsize=12)
    for spine in ax.spines.values():
        spine.set_color('black')
        spine.set_linewidth(1.5)

    fig.tight_layout()
    return fig


def plot_correlation2(data, correlation, p_value, significant):
    """Draw an alternative scatter plot with a title and labelled samples.

    The figure shows the scatter and regression line, puts rho and p in the
    title, draws a coloured significance badge in the top-left corner and
    labels every third sample with its ID.

    Args:
        data: DataFrame containing the sample ID, tumor and normal mutation
            count columns.
        correlation: Spearman correlation coefficient.
        p_value: P-value of the correlation test.
        significant: Whether the correlation is significant.

    Returns:
        The matplotlib Figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    sns.scatterplot(
        x=_TUMOR_COL,
        y=_NORMAL_COL,
        data=data,
        s=50,
        alpha=0.7,
        color='#3B82F6',
        ax=ax,
    )
    sns.regplot(
        x=_TUMOR_COL,
        y=_NORMAL_COL,
        data=data,
        scatter=False,
        line_kws={'color': '#F59E0B', 'lw': 2.5},
        ax=ax,
    )

    ax.set_title(
        f'Spearman correlation analysis of the number of mutations between tumors and normal tissues\n'
        fr'$\rho$ = {correlation:.3f}, p = {p_value:.3e}',
        fontsize=16,
        pad=15,
    )
    ax.set_xlabel('Number of tumor tissue mutations', fontsize=14)
    ax.set_ylabel('Number of normal tissue mutations', fontsize=14)

    # The badge text must agree with the outcome: the original always printed
    # "(p < 0.05)", even for a non-significant result.
    if significant:
        badge_text = 'Significant (p < 0.05)'
        badge_color = '#10B981'
    else:
        badge_text = 'Not significant (p ≥ 0.05)'
        badge_color = '#EF4444'

    ax.add_patch(
        Rectangle(
            (0.05, 0.92),
            0.4,
            0.06,
            transform=ax.transAxes,
            facecolor=badge_color,
            alpha=0.2,
            edgecolor=badge_color,
            linewidth=1.5,
            linestyle='--',
        )
    )
    ax.text(
        0.07,
        0.94,
        badge_text,
        transform=ax.transAxes,
        fontsize=12,
        color=badge_color,
        fontweight='bold',
    )

    # Label every third point to avoid clutter. Selecting by position keeps
    # this correct even when the frame does not have a default integer index.
    for _, row in data.iloc[::3].iterrows():
        ax.annotate(
            row[_SAMPLE_COL],
            (row[_TUMOR_COL], row[_NORMAL_COL]),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9,
            color='#1F2937',
        )

    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    return fig


def main():
    """Simulate data, run the Spearman test, print a report and save the plot."""
    data = generate_mutation_data(n_samples=20)

    correlation, p_value, significant = calculate_spearman_correlation(data)

    print("肿瘤与正常组织突变数量的Spearman相关性分析结果:")
    print(f"样本数: {len(data)}")
    print(f"Spearman相关系数 (ρ): {correlation:.4f}")
    print(f"P值: {p_value:.6f}")
    print(f"是否显著相关 (p<0.05): {'是' if significant else '否'}")

    print("\n数据摘要:")
    print(data.describe())

    fig = plot_correlation(data, correlation, p_value, significant)
    fig.savefig('mutation_correlation.png', dpi=300, bbox_inches='tight')

    plt.show()


if __name__ == "__main__":
    main()