bio-proteomics-proteomics-qc

star 30

Quality control and assessment for proteomics data. Use when evaluating proteomics data quality before downstream analysis. Covers sample metrics, missing value patterns, replicate correlation, batch effects, and intensity distributions.

mdbabumiamssm By mdbabumiamssm schedule Updated 2/4/2026

name: bio-proteomics-proteomics-qc description: Quality control and assessment for proteomics data. Use when evaluating proteomics data quality before downstream analysis. Covers sample metrics, missing value patterns, replicate correlation, batch effects, and intensity distributions. tool_type: mixed primary_tool: pandas measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools: - read_file - run_shell_command

Proteomics Quality Control

Sample Quality Metrics

import pandas as pd
import numpy as np

def sample_qc_metrics(intensity_matrix):
    '''Calculate per-sample QC metrics'''
    metrics = pd.DataFrame(index=intensity_matrix.columns)
    metrics['n_proteins'] = intensity_matrix.notna().sum()
    metrics['median_intensity'] = intensity_matrix.median()
    metrics['mean_intensity'] = intensity_matrix.mean()
    metrics['cv'] = intensity_matrix.std() / intensity_matrix.mean()
    metrics['missing_pct'] = 100 * intensity_matrix.isna().sum() / len(intensity_matrix)
    return metrics

qc = sample_qc_metrics(log2_intensities)
print(qc)

Replicate Correlation

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def replicate_correlation(intensity_matrix, sample_groups):
    '''Calculate within-group correlations'''
    corr_matrix = intensity_matrix.corr(method='pearson')

    # Mask for within-group comparisons
    results = []
    for group in sample_groups.unique():
        group_samples = sample_groups[sample_groups == group].index
        for i, s1 in enumerate(group_samples):
            for s2 in group_samples[i+1:]:
                r = corr_matrix.loc[s1, s2]
                results.append({'group': group, 'sample1': s1, 'sample2': s2, 'correlation': r})

    return pd.DataFrame(results)

# Heatmap
sns.clustermap(intensity_matrix.corr(), cmap='RdBu_r', center=0, vmin=-1, vmax=1,
               figsize=(10, 10), annot=False)
plt.savefig('correlation_heatmap.pdf')

Missing Value Patterns

import missingno as msno

def analyze_missing_patterns(intensity_matrix):
    '''Analyze missing value patterns'''
    # Missing value matrix visualization
    msno.matrix(intensity_matrix, figsize=(12, 8))
    plt.savefig('missing_pattern.pdf')

    # Missing by sample
    missing_per_sample = intensity_matrix.isna().sum() / len(intensity_matrix) * 100

    # Missing by protein
    missing_per_protein = intensity_matrix.isna().sum(axis=1) / intensity_matrix.shape[1] * 100

    # Check for systematic patterns
    return {'per_sample': missing_per_sample, 'per_protein': missing_per_protein}

Batch Effect Detection with PCA

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def detect_batch_effects(intensity_matrix, sample_info, batch_col='batch'):
    '''PCA to detect batch effects'''
    # Impute for PCA (temporary)
    imputed = intensity_matrix.fillna(intensity_matrix.median())
    scaled = StandardScaler().fit_transform(imputed.T)

    pca = PCA(n_components=5)
    pcs = pca.fit_transform(scaled)
    pc_df = pd.DataFrame(pcs, columns=[f'PC{i+1}' for i in range(5)], index=intensity_matrix.columns)
    pc_df = pc_df.join(sample_info)

    # Check batch association with PCs
    from scipy.stats import f_oneway
    for pc in ['PC1', 'PC2', 'PC3']:
        groups = [pc_df[pc_df[batch_col] == b][pc] for b in pc_df[batch_col].unique()]
        stat, pval = f_oneway(*groups)
        print(f'{pc} ~ {batch_col}: F={stat:.2f}, p={pval:.4f}')

    return pc_df, pca.explained_variance_ratio_

R: QC with limma

library(limma)
library(ggplot2)

# Intensity distribution
plotDensities(protein_matrix, legend = FALSE, main = 'Intensity Distributions')

# MA plots between samples
for (i in 2:ncol(protein_matrix)) {
    plotMA(protein_matrix[, c(1, i)], main = paste('MA:', colnames(protein_matrix)[i]))
}

# MDS plot (similar to PCA)
plotMDS(protein_matrix, col = as.numeric(sample_info$condition))

Coefficient of Variation

def calculate_cv(intensity_matrix, sample_groups):
    '''Calculate CV within groups'''
    cv_results = []
    for group in sample_groups.unique():
        group_samples = sample_groups[sample_groups == group].index
        group_data = intensity_matrix[group_samples]

        # CV per protein
        cv = group_data.std(axis=1) / group_data.mean(axis=1) * 100
        cv_results.append({'group': group, 'median_cv': cv.median(), 'mean_cv': cv.mean()})

    return pd.DataFrame(cv_results)

# Technical replicates should have CV < 20%
# Biological replicates typically 20-40%

Digestion Efficiency

def check_digestion(evidence_df):
    '''Check digestion efficiency from MaxQuant evidence.txt'''
    # Missed cleavages distribution
    mc_dist = evidence_df['Missed cleavages'].value_counts(normalize=True) * 100
    print('Missed cleavage distribution:')
    print(mc_dist)

    # Good digestion: >80% with 0 missed cleavages
    if mc_dist.get(0, 0) < 80:
        print('Warning: Poor digestion efficiency (<80% fully cleaved)')

    return mc_dist

QC Report Summary

def generate_qc_report(intensity_matrix, sample_info):
    '''Generate comprehensive QC summary'''
    report = {
        'n_samples': intensity_matrix.shape[1],
        'n_proteins': intensity_matrix.shape[0],
        'median_proteins_per_sample': intensity_matrix.notna().sum().median(),
        'overall_missing_pct': 100 * intensity_matrix.isna().sum().sum() / intensity_matrix.size,
        'median_correlation': intensity_matrix.corr().values[np.triu_indices_from(intensity_matrix.corr(), k=1)].mean(),
    }

    # Flags
    report['flags'] = []
    if report['overall_missing_pct'] > 30:
        report['flags'].append('High missing values (>30%)')
    if report['median_correlation'] < 0.9:
        report['flags'].append('Low replicate correlation (<0.9)')

    return report

Related Skills

  • data-import - Load data before QC
  • quantification - Normalization after QC
  • differential-abundance - Analysis after QC passes
  • data-visualization/heatmaps-clustering - QC heatmaps
Install via CLI
npx skills add https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- --skill bio-proteomics-proteomics-qc
Repository Details
star Stars 30
call_split Forks 7
navigation Branch main
article Path SKILL.md
More from Creator
mdbabumiamssm
mdbabumiamssm Explore all skills →