Hello every guys,Over the past few days I have tried using the following code: cutadapt --discard -- trimmed -- g GGTTGAGGTAGTATTGCGCAATG -- G GGTTGAGGTAGTATTGCGCAATG divides the HIRES data into HIC and scRNA. The HIC is filtered by Fastp and Fastqc and compared with the reference genome of Bowtie2, and then the corresponding .BAM file is obtained by Samtools view, sort and index. After merging .bam files with Sambamba, there is no suitable way to use juicer to process merged.bam. Is there any other way to repeat .BAM file merging between samples and use juicer to handle the next steps of HIC and RNA merging?
Best wishes!
All the code below:
parallel_Cutadapt.sh
#!/bin/bash
# 检查是否安装了GNU Parallel
if ! command -v parallel &> /dev/null; then
echo "GNU Parallel could not be found, please install it first."
exit 1
fi
# 创建namelist文件,列出所有样本文件夹名称
ls -d */ > namelist
# 定义处理DNA和RNA序列的函数
process_sample() {
local sample=$1
# 去除末尾的斜杠
sample=$(echo "$sample" | sed 's:/*$::')
echo "${sample} start"
if [ ! -d "${sample}" ]; then
echo "Directory ${sample} does not exist, skipping."
return
fi
cd "${sample}" || { echo "Failed to enter ${sample}"; return; }
# DNA处理
[ ! -d "DNA" ] && mkdir -p DNA
[ ! -d "RNA" ] && mkdir -p RNA
cutadapt --discard-trimmed -g GGTTGAGGTAGTATTGCGCAATG -G GGTTGAGGTAGTATTGCGCAATG -o "${sample}_DNA_R1.fastq.gz" -p "${sample}_DNA_R2.fastq.gz" *_1.fastq.gz *_2.fastq.gz > "${sample}_cutadapt.DNA.txt"
mv "${sample}_DNA"* DNA/
# RNA处理
cutadapt --discard-untrimmed -g GGTTGAGGTAGTATTGCGCAATG -G GGTTGAGGTAGTATTGCGCAATG -o "${sample}_RNA_R1.fastq.gz" -p "${sample}_RNA_R2.fastq.gz" *_1.fastq.gz *_2.fastq.gz > "${sample}_cutadapt.RNA.txt"
mv "${sample}_RNA"* RNA/
echo "${sample} finish"
cd ..
}
# 定义进一步处理DNA序列的函数
process_dna() {
local sample=$1
# 去除末尾的斜杠
sample=$(echo "$sample" | sed 's:/*$::')
echo "${sample} DNA processing start"
if [ ! -d "${sample}/DNA" ]; then
echo "Directory ${sample}/DNA does not exist, skipping."
return
fi
cd "${sample}/DNA" || { echo "Failed to enter ${sample}/DNA"; return; }
source /data5/tan/zengchuanj/conda/bin/activate
conda activate HiC-Pro
fastp -i "${sample}_DNA_R1.fastq.gz" -I "${sample}_DNA_R2.fastq.gz" -o "${sample}_clean.1.fq.gz" -O "${sample}_clean.2.fq.gz" -q 20 -w 16 -n 5
fastqc -t 10 "${sample}_clean.1.fq.gz" "${sample}_clean.2.fq.gz"
bowtie2 -p 20 -x /data5/tan/zengchuanj/pipeline/HIC/juicer/references/mm10/bowtie2_index/mm10 -1 "${sample}_clean.1.fq.gz" -2 "${sample}_clean.2.fq.gz" -S "${sample}_out.sam" 2>"${sample}_mapping.txt"
samtools view -bS "${sample}_out.sam" > "${sample}_unsorted.bam"
samtools sort "${sample}_unsorted.bam" -o "${sample}_sorted.bam"
# Index sorted BAM
samtools index "${sample}_sorted.bam"
echo "${sample} DNA processing finish"
cd ../..
}
# 定义进一步处理RNA序列的函数
process_rna() {
local sample=$1
# 去除末尾的斜杠
sample=$(echo "$sample" | sed 's:/*$::')
echo "${sample} RNA processing start"
if [ ! -d "${sample}/RNA" ]; then
echo "Directory ${sample}/RNA does not exist, skipping."
return
fi
cd "${sample}/RNA" || { echo "Failed to enter ${sample}/RNA"; continue; }
source /data5/tan/zengchuanj/Software/MACS3/MyPythonEnv/bin/activate
trim_galore -j 20 --phred33 --gzip --trim-n -o result --paired *.fastq.gz
cd result
source /data5/tan/zengchuanj/conda/bin/activate
conda activate HiC-Pro
fastp -i "${sample}_RNA_R1.fastq.gz" -I "${sample}_RNA_R2.fastq.gz" -o "${sample}_clean.1.fq.gz" -O "${sample}_clean.2.fq.gz" -q 20 -w 16 -n 5
fastqc -t 10 "${sample}_clean.1.fq.gz" "${sample}_clean.2.fq.gz"
hisat2 -p 20 -x /data5/tan/zengchuanj/pipeline/HIC/juicer/references/mm10/mm10 -1 ${sample}_RNA.1_val_1.fq.gz -2 ${sample}_RNA.2_val_2.fq.gz 2>"${sample}_hisat.txt" | samtools view -o "${sample}_outname.bam"
samtools sort -@ 20 -o ${sample}.sort.bam ${sample}_outname.bam
##featurecounts定量
# 使用ensamble的GTF
echo ${sample} 'Feature counts start'
### (ucsc的话 -t exon)
# 根据gene与exon调整参数
featureCounts -p --countReadPairs -T 20 -t gene -a /data5/tan/zengchuanj/pipeline/HIC/juicer/references/GTF/Mus_musculus.GRCm38.102.gtf -o ${sample}_count.txt ${sample}.sort.bam
# 备份生成的文件到样本目录
cp clean*.fq.gz ../
cp "../${sample}_hisat.txt" ../
cp "../${sample}_outname.bam" ../
cp "../${sample}.sort.bam" ../
echo "${sample} RNA processing finish"
cd ../../..
}
# 使用parallel并行处理每个样本,并限制最大线程数为15
export -f process_sample
export -f process_dna
export -f process_rna
# 并行处理DNA和RNA序列
parallel -j 5 process_sample ::: $(cat namelist)
# 并行进一步处理DNA序列
parallel -j 5 process_dna ::: $(cat namelist)
# 并行进一步处理RNA序列
parallel -j 5 process_rna ::: $(cat namelist)
sambamba merge All_merged.bam */DNA/*sorted.bam
samtools sort -n All_merged.bam -o All_sorted.bam
samtools index -@ 20 All_sorted.bam