Using juicer to process HIRES data

32 views
Skip to first unread message

sui xiang

unread,
Mar 10, 2025, 11:06:21 PMMar 10
to 3D Genomics
        Hello every guys,Over the past few days I have tried using the following code: cutadapt --discard -- trimmed -- g GGTTGAGGTAGTATTGCGCAATG -- G GGTTGAGGTAGTATTGCGCAATG divides the HIRES data into HIC and scRNA. The HIC is filtered by Fastp and Fastqc and compared with the reference genome of Bowtie2, and then the corresponding .BAM file is obtained by Samtools view, sort and index. After merging .bam files with Sambamba, there is no suitable way to use juicer to process merged.bam. Is there any other way to repeat .BAM file merging between samples and use juicer to handle the next steps of HIC and RNA merging? 
        Best wishes!

All the code below:
parallel_Cutadapt.sh
#!/bin/bash
# 检查是否安装了GNU Parallel
if ! command -v parallel &> /dev/null; then
    echo "GNU Parallel could not be found, please install it first."
    exit 1
fi

# 创建namelist文件,列出所有样本文件夹名称
ls -d */ > namelist

# 定义处理DNA和RNA序列的函数
process_sample() {
    local sample=$1
    # 去除末尾的斜杠
    sample=$(echo "$sample" | sed 's:/*$::')
    echo "${sample} start"
   
    if [ ! -d "${sample}" ]; then
        echo "Directory ${sample} does not exist, skipping."
        return
    fi
   
    cd "${sample}" || { echo "Failed to enter ${sample}"; return; }

    # DNA处理
    [ ! -d "DNA" ] && mkdir -p DNA
    [ ! -d "RNA" ] && mkdir -p RNA
    cutadapt --discard-trimmed -g GGTTGAGGTAGTATTGCGCAATG -G GGTTGAGGTAGTATTGCGCAATG -o "${sample}_DNA_R1.fastq.gz" -p "${sample}_DNA_R2.fastq.gz" *_1.fastq.gz *_2.fastq.gz > "${sample}_cutadapt.DNA.txt"
   
    mv "${sample}_DNA"* DNA/

    # RNA处理
    cutadapt --discard-untrimmed -g GGTTGAGGTAGTATTGCGCAATG -G GGTTGAGGTAGTATTGCGCAATG -o "${sample}_RNA_R1.fastq.gz" -p "${sample}_RNA_R2.fastq.gz" *_1.fastq.gz *_2.fastq.gz > "${sample}_cutadapt.RNA.txt"
    mv "${sample}_RNA"* RNA/

    echo "${sample} finish"
    cd ..
}

# 定义进一步处理DNA序列的函数
process_dna() {
    local sample=$1
    # 去除末尾的斜杠
    sample=$(echo "$sample" | sed 's:/*$::')
    echo "${sample} DNA processing start"
   
    if [ ! -d "${sample}/DNA" ]; then
        echo "Directory ${sample}/DNA does not exist, skipping."
        return
    fi
   
    cd "${sample}/DNA" || { echo "Failed to enter ${sample}/DNA"; return; }
   
    source /data5/tan/zengchuanj/conda/bin/activate
    conda activate HiC-Pro
    fastp -i "${sample}_DNA_R1.fastq.gz" -I "${sample}_DNA_R2.fastq.gz" -o "${sample}_clean.1.fq.gz" -O "${sample}_clean.2.fq.gz" -q 20 -w 16 -n 5
    fastqc -t 10 "${sample}_clean.1.fq.gz" "${sample}_clean.2.fq.gz"
    bowtie2 -p 20 -x /data5/tan/zengchuanj/pipeline/HIC/juicer/references/mm10/bowtie2_index/mm10 -1 "${sample}_clean.1.fq.gz" -2 "${sample}_clean.2.fq.gz" -S "${sample}_out.sam" 2>"${sample}_mapping.txt"
   
    samtools view -bS "${sample}_out.sam" > "${sample}_unsorted.bam"
    samtools sort "${sample}_unsorted.bam" -o "${sample}_sorted.bam"
    # Index sorted BAM
    samtools index "${sample}_sorted.bam"
   

    echo "${sample} DNA processing finish"
    cd ../..
}

# 定义进一步处理RNA序列的函数
process_rna() {
    local sample=$1
    # 去除末尾的斜杠
    sample=$(echo "$sample" | sed 's:/*$::')
    echo "${sample} RNA processing start"
   
    if [ ! -d "${sample}/RNA" ]; then
        echo "Directory ${sample}/RNA does not exist, skipping."
        return
    fi
   
    cd "${sample}/RNA" || { echo "Failed to enter ${sample}/RNA"; continue; }
   
    source /data5/tan/zengchuanj/Software/MACS3/MyPythonEnv/bin/activate
    trim_galore -j 20 --phred33 --gzip --trim-n -o result --paired *.fastq.gz
    cd result
   
    source /data5/tan/zengchuanj/conda/bin/activate
    conda activate HiC-Pro
    fastp -i "${sample}_RNA_R1.fastq.gz" -I "${sample}_RNA_R2.fastq.gz" -o "${sample}_clean.1.fq.gz" -O "${sample}_clean.2.fq.gz" -q 20 -w 16 -n 5
    fastqc -t 10 "${sample}_clean.1.fq.gz" "${sample}_clean.2.fq.gz"
    hisat2 -p 20 -x /data5/tan/zengchuanj/pipeline/HIC/juicer/references/mm10/mm10 -1 ${sample}_RNA.1_val_1.fq.gz -2 ${sample}_RNA.2_val_2.fq.gz 2>"${sample}_hisat.txt" | samtools view -o "${sample}_outname.bam"

    samtools sort -@ 20 -o ${sample}.sort.bam ${sample}_outname.bam
    ##featurecounts定量
    # 使用ensamble的GTF
    echo ${sample} 'Feature counts start'
    ### (ucsc的话 -t exon)
    # 根据gene与exon调整参数
    featureCounts -p --countReadPairs -T 20 -t gene -a /data5/tan/zengchuanj/pipeline/HIC/juicer/references/GTF/Mus_musculus.GRCm38.102.gtf -o ${sample}_count.txt ${sample}.sort.bam
    # 备份生成的文件到样本目录
    cp clean*.fq.gz ../
    cp "../${sample}_hisat.txt" ../
    cp "../${sample}_outname.bam" ../
    cp "../${sample}.sort.bam" ../

    echo "${sample} RNA processing finish"
    cd ../../..
}

# 使用parallel并行处理每个样本,并限制最大线程数为15
export -f process_sample
export -f process_dna
export -f process_rna

# 并行处理DNA和RNA序列
parallel -j 5 process_sample ::: $(cat namelist)

# 并行进一步处理DNA序列
parallel -j 5 process_dna ::: $(cat namelist)

# 并行进一步处理RNA序列
parallel -j 5 process_rna ::: $(cat namelist)

sambamba merge All_merged.bam */DNA/*sorted.bam
samtools sort -n All_merged.bam -o All_sorted.bam
samtools index -@ 20 All_sorted.bam
Reply all
Reply to author
Forward
0 new messages