diff --git a/src/input-files.c b/src/input-files.c
index 804d60c..4ba2fca 100644
--- a/src/input-files.c
+++ b/src/input-files.c
@@ -3213,6 +3213,7 @@ int reduce_SAM_to_BAM(SAM_pairer_context_t * pairer , SAM_pairer_thread_t * thre
int is_important_tag = (in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'H') ||
(in_str[in_ptr+0] == 'H' && in_str[in_ptr+1] == 'I') ||
(in_str[in_ptr+0] == 'R' && in_str[in_ptr+1] == 'G') ||
+ (in_str[in_ptr+0] == 'C' && in_str[in_ptr+1] == 'B') ||
(in_str[in_ptr+0] == 'N' && in_str[in_ptr+1] == 'M') ;
int xxnch;
if(in_str[in_ptr + 3] == 'Z' || in_str[in_ptr + 3] == 'H'){
@@ -3558,7 +3559,7 @@ void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2, int need_
SAM_pairer_iterate_int_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "HI", &HItag);
if( need_RG_tag ){
char RG_type=0;
- SAM_pairer_iterate_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "RG", &RG_type, &RG_tag_val);
+ SAM_pairer_iterate_tags((unsigned char *)(bin1+bin1ptr),block1len + 4 - bin1ptr, "CB", &RG_type, &RG_tag_val);
if(RG_type != 'Z') RG_tag_val = NULL;
//SUBREADprintf("type=%c\tval=%s\n", RG_type, RG_tag_val);
}
@@ -3639,8 +3640,8 @@ void SAM_pairer_make_dummy(char * rname, char * bin1, char * out_bin2, int need_
}
}
if(RG_tag_val){
- out_bin2[tag_ptr++]='R';
- out_bin2[tag_ptr++]='G';
+ out_bin2[tag_ptr++]='C';
+ out_bin2[tag_ptr++]='B';
out_bin2[tag_ptr++]='Z';
all_len +=3;
while(*RG_tag_val){
diff --git a/src/readSummary.c b/src/readSummary.c
index 9aab699..80d69b0 100644
--- a/src/readSummary.c
+++ b/src/readSummary.c
@@ -1832,7 +1832,7 @@ int process_pairer_header (void * pairer_vp, int thread_no, int is_text, unsigne
int rcursor=0;
for(;rcursor
add_CB_tags () {
# Add "@CB ID:" for each barcode listed in
# cell_barcodes_file to the BAM header (stdin).
#
# This function is supposed to be called like this:
#
# samtools reheader -c "add_CB_tags '${cell_barcodes_file}'"
local cell_barcodes_file="${1}";
mawk \
-F '\t' \
-v cell_barcodes_file="${cell_barcodes_file}" '
BEGIN {
# Read barcodes in the cell_barcode_array array.
while ( (getline < cell_barcodes_file) > 0 ) {
cell_barcode_idx += 1;
# Save all cell barcodes:
cell_barcode_array[cell_barcode_idx] = $0;
}
}
{
if ($1 ~ /^@/) {
if ($1 !~ /^@CB/) {
# Print BAM header line (if it is not a "CB" tag header line).
print $0;
}
}
} END {
# Print a header line for each cell barcode after the original header.
for (cell_barcode_idx=1; cell_barcode_idx <= length(cell_barcode_array); cell_barcode_idx++) {
print "@CB\tID:" cell_barcode_array[cell_barcode_idx];
}
}' -
}
export -f add_CB_tags
create_single_cell_bam_file_for_feature_counts_with_selected_cell_barcodes_only () {
# Create output single-cell BAM file from input single-cell BAM file
# with only reads from selected cell barcodes.
if [ ${#@} -ne 4 ] ; then
printf 'Usage:\n';
printf ' create_single_cell_bam_file_for_feature_counts_with_selected_cell_barcodes_only \\\n';
printf ' single_cell_input_bam_file \\\n';
printf ' single_cell_output_bam_file \\\n';
printf ' cell_barcodes_file \\\n';
printf ' sort_by_read_name\n\n';
printf 'Parameters:\n';
printf ' - single_cell_input_bam_file:\n';
printf ' Single-cell BAM file (e.g.: CellRanger output: "possorted_bam.bam") from\n';
printf ' which only reads which match cell barcodes listed in the barcodes file\n';
printf ' will be extracted.\n';
printf ' - single_cell_output_bam_file:\n';
printf ' Output read sorted or position sorted BAM file to which only reads with\n';
printf ' cell barcodes listed in the cell barcodes file will be written. This BAM\n';
printf ' file will have "CB" tags added to the header for each cell barcode so it\n';
printf ' can be used as input for featureCounts.\n';
printf ' - cell_barcodes_file:\n';
printf ' File with cell barcodes (e.g.: CellRanger output: "barcodes.tsv").\n';
printf ' - sort_order:\n';
printf ' - 0: "ns": do not resort\n';
printf ' - 1: "rn": by read name\n';
printf ' - 2: "cp": by chromosomal position\n';
printf ' - 3: "cbrn": by cell barcode and secondary by read name\n';
printf ' - 4: "cbcp": by cell barcode and secondary by chromosomal position\n\n';
printf 'Purpose:\n';
printf ' Create output single-cell BAM file from input single-cell BAM file with only\n';
printf ' reads from selected cell barcodes.\n\n';
return 1;
fi
local single_cell_input_bam_file="${1}";
local single_cell_output_bam_file="${2}";
local cell_barcodes_file="${3}";
local sort_order="${4}";
local exit_code=0;
# SAMtools sort order related arguments.
local samtools_sort_order_args='';
local -i samtools_no_sort=0;
# Check if reads in output BAM file should be sorted by read name.
case "${sort_order}" in
# Do not resort.
'0') samtools_no_sort=1;;
'ns') samtools_no_sort=1;;
# Sort by read name.
'1') samtools_sort_order_args='-n';;
'rn') samtools_sort_order_args='-n';;
# Sort by chromosomal position.
'2') samtools_sort_order_args='';;
'cp') samtools_sort_order_args='';;
# Sort by cell barcode and secondary by read name.
'3') samtools_sort_order_args='-t CB -n';;
'cbrn') samtools_sort_order_args='-t CB -n';;
# Sort by cell barcode and secondary by chromosomal position.
'4') samtools_sort_order_args='-t CB';;
'cbcp') samtools_sort_order_args='-t CB';;
*) printf 'Error: Sort order "%s" is not a valid option.\n' "${sort_order}";
return 1;;
esac
if [ ${samtools_no_sort} -eq 1 ] ; then
# Extract only reads with barcodes from barcodes file.
# - Only keep reads with cell barcodes of interests.
# -D: "Specify barcode tag":"Only include reads with barcode listed in FILE [null]"
# - Add "@CB ID:" for each barcode to the BAM header.
# - Sort reads by read name or by position.
samtools view \
-b \
-O BAM \
-@ "${samtools_nbr_threads}" \
-D "CB:${cell_barcodes_file}" \
"${single_cell_input_bam_file}" \
| samtools reheader -c "add_CB_tags '${cell_barcodes_file}'" - \
> "${single_cell_output_bam_file}";
else
# Extract only reads with barcodes from barcodes file.
# - Only keep reads with cell barcodes of interests.
# -D: "Specify barcode tag":"Only include reads with barcode listed in FILE [null]"
# - Add "@CB ID:" for each barcode to the BAM header.
# - Sort reads by read name or by position.
samtools view \
-u \
-O BAM \
-@ "${samtools_nbr_threads}" \
-D "CB:${cell_barcodes_file}" \
"${single_cell_input_bam_file}" \
| samtools reheader -c "add_CB_tags '${cell_barcodes_file}'" - \
| samtools sort \
${samtools_sort_order_args} \
-O BAM \
-m "${samtools_sort_memory}" \
-@ "${samtools_nbr_threads}" \
-o "${single_cell_output_bam_file}";
fi
# Check if any of the piped commands failed.
for exit_code in "${PIPESTATUS[@]}" ; do
if [ "${exit_code}" -ne 0 ] ; then
return ${exit_code};
fi
done
if [ "${sort_order}" = "cp" ] ; then
# Index BAM file if it is sorted by chromosomal position.
samtools index "${single_cell_output_bam_file}";
return $?;
fi
return 0;
}
create_single_cell_bam_file_for_feature_counts_with_selected_cell_barcodes_only
for specific cellbarcodes and add @CB tags to the BAM header, so the RG code will work with it.
run_feature_counts_on_single_cell_bam_file () {
if [ ${#@} -ne 3 ] ; then
printf 'Usage:\n';
printf ' run_feature_counts_on_single_cell_bam_file \\\n';
printf ' single_cell_input_bam_file \\\n';
printf ' peaks_gff_file|peaks_saf_file \\\n';
printf ' feature_counts_output_file\n\n';
printf 'Parameters:\n';
printf ' - single_cell_input_bam_file:\n';
printf ' Single-cell input BAM file (preferably sorted by read name for\n';
printf ' paired-end) with only reads for cell barcodes you want to\n';
printf ' generate counts (per feature) for.\n';
printf ' - peaks_gff_filename:\n';
printf ' Peak file in GFF format which contains the regions for which you\n';
printf ' want to count the number of reads per feature per barcode.\n';
printf ' - feature_counts_output_file:\n';
printf ' featureCounts output file.\n\n';
printf 'Purpose:\n';
printf ' Run featureCounts on single-cell BAM file and count features for each\n';
printf ' cell barcode separately.\n\n';
return 1;
fi
local single_cell_input_bam_file="${1}";
local peaks_gff_or_peaks_saf_file="${2}";
local feature_counts_output_file="${3}";
local annotation_settings="";
if [ "${peaks_gff_or_peaks_saf_file%.saf}" != "${peaks_gff_or_peaks_saf_file}" ] ; then
annotation_settings='-F SAF';
else
annotation_settings='-F GTF -t Region -g region';
fi
featureCounts \
-T 4 \
--byReadGroup \
-s 0 \
-Q 0 \
-p \
--ignoreDup \
${annotation_settings} \
-a "${peaks_gff_or_peaks_saf_file}" \
-o "${feature_counts_output_file}" \
"${single_cell_input_bam_file}";
}
run_feature_counts_on_single_cell_bam_file "${@}";
run_feature_counts_on_single_cell_bam_file on the cell barcodes filtered BAM file and
generate a matrix with counts for each gene/region for each cell barcode.
Thanks,
Gert