Source code for deside.bulk_cell.preprocessing

import os
import gzip
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from ..utility import print_df, center_value, log2_transform, check_dir, get_sep


def get_data_from_firehose_raw(file_path, data_type):
    """
    get data from raw data file which is downloaded from firehose
    http://gdac.broadinstitute.org/
    :param file_path:
    :param data_type: raw_count	or scaled_estimate
    :return: a dataframe
    """
    print('>>> Start to read bulk expression file...')
    bulk_raw_data = pd.read_csv(file_path, sep='\t', header=[0, 1], index_col=0)
    idx = pd.IndexSlice
    if data_type not in ['raw_count', 'scaled_estimate']:
        raise TypeError('data_type should be raw_count or scaled_estimate')
    bulk_exp = bulk_raw_data.loc[:, idx[:, data_type]]  # genes by samples
    if data_type == 'scaled_estimate':
        bulk_exp = bulk_exp * 1e6
    bulk_exp.columns = bulk_exp.columns.get_level_values(0)
    # print_df(bulk_exp)
    return bulk_exp


def get_gene_len(file_path):
    """
    get gene length (only exon)
    gene info file downloaded from https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
    - file TCGA.hg19.June2011.gaf
    :param file_path:
    :return:
    """
    gene_info = pd.read_csv(file_path, sep='\t', index_col='FeatureID')
    gene_info['GeneLen'] = gene_info['FeatureCoordinates'].map(lambda x: int(x.split(',')[-1].split('-')[-1]))
    return gene_info.loc[:, ['GeneLen']]


def normalize_by_gene_len(bulk_exp, gene_len):
    """
    normalize bulk expression by gene length
    :param bulk_exp: a data frame contains all expression data of each sample
    :param gene_len: a data frame contains gene length information, 'GeneLen' is a column, bp
    :return: normalized bulk expression
    """
    # common_genes = list(set(bulk_exp.index.to_list()) & set(gene_len.index.to_list()))
    common_genes = [i for i in bulk_exp.index if i in gene_len.index]
    bulk_exp = bulk_exp.loc[common_genes, :]
    gene_len = gene_len.loc[common_genes, :]
    bulk_norm_by_gene_len = bulk_exp / np.vstack(gene_len['GeneLen']) * 1000
    return bulk_norm_by_gene_len


def split_cancer_normal_samples(bulk_exp_fp, sample_info_fp):
    """
    split samples by sample information to cancer group and normal group
    :param bulk_exp_fp:
    :param sample_info_fp: get from http://gdac.broadinstitute.org/runs/stddata__latest/samples_report/HNSC.html
    :return: dict
           only cancer and normal tissue bulk expression data
    """
    bulk_exp = pd.read_csv(bulk_exp_fp, index_col=0)
    sample_info = pd.read_csv(sample_info_fp, sep='\t')
    sample_info = sample_info.loc[sample_info['Protocol'] == 'RSEM_genes', :].copy()
    if '.' in bulk_exp.columns[0]:  # replaced '-' with '.' by R
        bulk_exp.columns = [i.replace('.', '-') for i in bulk_exp.columns]
    bulk_exp.index = bulk_exp.index.map(lambda x: x.split('|')[0] if x[0] != '?' else x)  # split and get gene name
    # print_df(bulk_exp)
    bulk_exp_normal_tissue = bulk_exp.loc[:, sample_info.loc[sample_info['sample type'] ==
                                                             'Solid Tissue Normal', 'TCGA Barcode']]
    # only keep Primary Solid Tumor
    bulk_exp_cancer = bulk_exp.loc[:, sample_info.loc[sample_info['sample type'].isin(['Primary Solid Tumor']),
                                                      'TCGA Barcode']]
    bulk_exp_cancer = bulk_exp_cancer.loc[~bulk_exp_cancer.index.duplicated(keep='first')]
    bulk_exp_normal_tissue = bulk_exp_normal_tissue.loc[~bulk_exp_normal_tissue.index.duplicated(keep='first')]
    return {'cancer': bulk_exp_cancer, 'normal_tissue': bulk_exp_normal_tissue}


def get_sample2subtype(bulk_exp, subtype_info_fp):
    """
    match subtype by the supplementary materials of HNSC marker paper
    set the type of samples that don't include in marker paper as 'New'
    :param bulk_exp: bulk expression dataframe, only use sample names in this bulk expression dataframe
    :param subtype_info_fp: a file comes from the supplementary materials of HNSC marker paper
    :return:
    """
    subtype = pd.read_excel(subtype_info_fp, index_col=0)
    subtype.index = subtype.index.map(lambda x: x.replace('.', '-'))
    sample2subtype = {}
    for s in bulk_exp.columns.to_list():
        _s = s[:12]
        if _s in subtype.index:
            _st = subtype.loc[_s, 'RNA']
        else:
            _st = 'New'
        sample2subtype[s] = _st
    sample2subtype_df = pd.DataFrame.from_dict(sample2subtype, orient='index')
    # print('  >>> Shape of sample2subtype: {}'.format(sample2subtype_df.shape))
    sample2subtype_df.rename(columns={0: 'subtype'}, inplace=True)
    return sample2subtype_df


def filter_by_corr_with_subclass(bulk_exp, subtype, corr_threshold=0.1):
    """
    filter samples by correlation with each subclass in marker paper

    :param bulk_exp: pd.DataFrame
        bulk cell expression value normalized by TMM (edgeR), don't need to log transform and center
    :param subtype: pd.DataFrame
        selected typical samples for each subtype, a dataframe with
                     ['reordered_ind', 'cluster_id', 'subtype', 'reclass'] as columns, and 'sample_name' as index
                     this information comes from the SI of marker paper
                     Nature 517, 576–582 (2015). https://doi.org/10.1038/nature14129
    :param corr_threshold:
    :return:
    """
    # df_c = df_c.loc[:, ~df_c.index.isin(subtype.index)].copy()
    mean_corr2each_subtype = pd.DataFrame(index=bulk_exp.columns)
    # print_df(df_c)
    _bulk_exp = bulk_exp.copy()
    bulk_exp = log2_transform(bulk_exp)
    bulk_exp = center_value(bulk_exp)
    sample_corr = bulk_exp.corr()
    for _st in subtype['subtype'].unique():
        if _st != 'New':
            print('current subtype: {}'.format(_st))
            sample_corr2_with_st = sample_corr.loc[:, subtype[subtype['subtype'] == _st].index].copy()
            print_df(sample_corr2_with_st)
            mean_corr2each_subtype['mean_corr2{}'.format(_st.lower()[:3])] = sample_corr2_with_st.mean(axis=1)
    samples_filtered = _bulk_exp.loc[:, np.sum(mean_corr2each_subtype.values >= corr_threshold, axis=1) == 1]
    return samples_filtered


def unzip_and_merge(sample_info_file_path: str, gdc_download_dir: str,
                    result_dir: str, log_file_path: str,
                    file_type='htseq.counts'):
    """
    Unzip and merge files downloaded from https://portal.gdc.cancer.gov/ by cancer types (Project ID).
        And only keep "Primary Tumor" (one of Sample Type).

    :param sample_info_file_path: gdc_sample_sheet, downloaded from GDC.
        A .tsv file contains File ID, File Name, Data, Category, Data Type, Project ID, Case ID, Sample ID, Sample Type

    :param gdc_download_dir: file folder contains downloaded data from GDC
        One sample one .htseq.counts.gz file.
    :param result_dir: where to save merged files

    :param log_file_path: log file path

    :param file_type: file type downloaded from GDC website

    :return: None
    """
    print('Start to unzip downloaded files...')
    check_dir(result_dir)
    sample_info = pd.read_csv(sample_info_file_path, sep='\t')
    # only keep primary tumor
    sample_info = sample_info.loc[sample_info['Sample Type'] == 'Primary Tumor', :]
    cancer_type2file_name = {}
    file_name_counter = {}
    file_name2new_path = {}
    for _, row in tqdm(sample_info.iterrows()):
        cancer_type = row['Project ID'].split('-')[1]
        dir1 = row['File ID']
        file_name = row['File Name']
        file_path = os.path.join(gdc_download_dir, dir1, file_name)

        if os.path.exists(file_path):
            new_file_dir = os.path.join(result_dir, cancer_type)
            check_dir(new_file_dir)
            new_file_name = row['Sample ID'] + f'.{file_type}'
            if new_file_name not in file_name_counter:
                file_name_counter[new_file_name] = 0
            file_name_counter[new_file_name] += 1
            new_file_path = os.path.join(new_file_dir, new_file_name)
            file_name2new_path[new_file_name] = new_file_path
            if not os.path.exists(new_file_path):
                with gzip.open(file_path, 'rb') as f_in:
                    with open(new_file_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            if cancer_type not in cancer_type2file_name:
                cancer_type2file_name[cancer_type] = []
            cancer_type2file_name[cancer_type].append(new_file_name)
        else:
            with open(log_file_path, 'a') as f_log:
                f_log.write(f'File path not found: {file_path}' + '\n')
    for file_name, count in file_name_counter.items():
        if count > 1:  # remove duplicate files (duplicate Sample ID)
            if os.path.exists(file_name2new_path[file_name]):
                os.remove(file_name2new_path[file_name])

    # merge
    print('Start to merge files by cancer type...')
    for cancer_type, file_names in cancer_type2file_name.items():
        merged_result_file_path = os.path.join(result_dir, cancer_type, f'merged_{cancer_type}_{file_type}.csv')
        if not os.path.exists(merged_result_file_path):
            current_files = []
            for fn in file_names:
                if file_name_counter[fn] == 1:
                    current_df = pd.read_csv(file_name2new_path[fn], index_col=0, header=None, sep='\t')
                    current_df.columns = [fn.split('.')[0]]
                    current_df.index.name = 'gene_id'
                    current_files.append(current_df)
            merged_df = pd.concat(current_files, axis=1)
            print(f'    {cancer_type}: {merged_df.shape}')
            merged_df.to_csv(merged_result_file_path)


[docs]def read_counts2tpm(read_counts_file_path: str, annotation_file_path: str, result_dir: str, file_type='htseq.counts', file_name_prefix: str = ''): """ Convert read counts (htseq.counts) to TPM (transcript per million) :param read_counts_file_path: the file path of merged read counts file (.htseq.counts), separated by tab or comma, gene by sample :param result_dir: the folder of saving result files :param annotation_file_path: file path of gencode.gene.info.v22.tsv download from https://api.gdc.cancer.gov/data/b011ee3e-14d8-4a97-aed4-e0b10f6bbe82 or other annotation files, gene_type, gene_name and exon_length should be included :param file_type: htseq.counts, raw data type downloaded from https://portal.gdc.cancer.gov/ :param file_name_prefix: prefix of result file, only for naming :return: None """ print(f' Start to convert {file_type} to TPM...') # current_result_dir = os.path.dirname(result_dir) check_dir(result_dir) sep = get_sep(read_counts_file_path) read_counts = pd.read_csv(read_counts_file_path, index_col=0, sep=sep) index_type = 'gene_name' if 'ENSG' in read_counts.index[0]: index_type = 'gene_id' sep2 = get_sep(annotation_file_path, comment='#') anno = pd.read_csv(annotation_file_path, index_col=index_type, sep=sep2, comment='#') if index_type == 'gene_name': anno['gene_name'] = anno.index anno = anno.loc[anno['gene_type'] == 'protein_coding', ['gene_name', 'exon_length']] # only keep protein coding genes and convert Ensembl gene id to gene symbol filtered_read_counts_file_path = os.path.join(result_dir, f'{file_name_prefix}_{file_type}.csv') tpm_file_path = os.path.join(result_dir, f'{file_name_prefix}_TPM.csv') log2_trans_file_path = os.path.join(result_dir, f'{file_name_prefix}_log2tpm1p.csv') if not os.path.exists(tpm_file_path): merged_file = anno.merge(read_counts, left_index=True, right_index=True) merged_file.set_index('gene_name', inplace=True) duplicated_inx_bool = merged_file.index.duplicated(keep='first') if np.any(duplicated_inx_bool): print(' Remove duplicated genes by keeping the first row...') merged_file = merged_file[~duplicated_inx_bool] merged_file.iloc[:, 1:].to_csv(filtered_read_counts_file_path, float_format='%.3f') norm_by_gene_len = merged_file.iloc[:, 1:] / np.vstack(merged_file['exon_length']) * 1000 tpm = norm_by_gene_len / np.hstack(norm_by_gene_len.sum(axis=0)) * 1e6 n, m = tpm.shape # gene by sample print(' There are {} genes and {} samples.'.format(n, m)) tpm.to_csv(tpm_file_path, float_format='%.3f') tpm_t = tpm.T log2tpm1p = log2_transform(tpm_t) log2tpm1p.to_csv(log2_trans_file_path, float_format='%.3f') print(' Converting finished.') else: print(' Previous result has existed in file {}.'.format(tpm_file_path))