{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# CAL51_chiprep - chip_seq\n", "This notebook will create all the necessary files, scripts and folders to pre-process the aforementioned project. Is designed to be used in a jupyter server deployed in a system running SLURM. The majority of the scripts and heavy-lifting processes are wrapped up in sbatch scripts.As an end user, in order to pre-process your samples provided in the spread sheet, you will simply need to *run the entire notebook* (Cell > Run all) and the system should take care of the rest for you.\n", "#### Create necessary folder(s)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/raw_reads\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/processed_raw_reads\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/jsons\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save metadata file" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "Sequencing core user\tSequencing core project\tSequencing core password\tSequencing core library name\tName\tPaired-end or single-end\tGenome\tLibrary type\tControl\tStrand specificity\tBlacklist removal\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tA549-NoDex-p300_PC\tA549.p300.dex.00h.pos_ctrl\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-H3K27Ac-rep2\tCAL51.H3K27Ac.DMSO.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-H3K27Ac-rep3\tCAL51.H3K27Ac.DMSO.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-p300-rep2\tCAL51.p300.DMSO.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-p300-rep3\tCAL51.p300.DMSO.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-rep1-Ipctrl\tCAL51.ctrl.DMSO.rep1\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-rep2-Ipctrl\tCAL51.ctrl.DMSO.rep2\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-DMSO-rep3-Ipctrl\tCAL51.ctrl.DMSO.rep3\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-H3K27Ac-rep2\tCAL51.H3K27Ac.GT.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-H3K27Ac-rep3\tCAL51.H3K27Ac.GT.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-p300-rep2\tCAL51.p300.GT.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-p300-rep3\tCAL51.p300.GT.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-rep1-Ipctrl\tCAL51.ctrl.GT.rep1\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-rep2-Ipctrl\tCAL51.ctrl.GT.rep2\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-G_T-rep3-Ipctrl\tCAL51.ctrl.GT.rep3\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-H3K27Ac-rep2\tCAL51.H3K27Ac.Gefit.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-H3K27Ac-rep3\tCAL51.H3K27Ac.Gefit.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-p300-rep2\tCAL51.p300.Gefit.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-p300-rep3\tCAL51.p300.Gefit.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-rep1-Ipctrl\tCAL51.ctrl.Gefit.rep1\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-rep2-Ipctrl\tCAL51.ctrl.Gefit.rep2\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Gefit-rep3-Ipctrl\tCAL51.ctrl.Gefit.rep3\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-H3K27Ac-rep2\tCAL51.H3K27Ac.THZ531.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-H3K27Ac-rep3\tCAL51.H3K27Ac.THZ531.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-p300-rep2\tCAL51.p300.THZ531.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-p300-rep3\tCAL51.p300.THZ531.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-rep1-Ipctrl\tCAL51.ctrl.THZ531.rep1\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-rep2-Ipctrl\tCAL51.ctrl.THZ531.rep2\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-THZ531-rep3-Ipctrl\tCAL51.ctrl.THZ531.rep3\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorino-H3K27Ac-rep1\tCAL51.H3K27Ac.Vorino.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.Vorino.rep1\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorino-H3K27Ac-rep2\tCAL51.H3K27Ac.Vorino.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.Vorino.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorino-H3K27Ac-rep3\tCAL51.H3K27Ac.Vorino.rep3\tSE\thg38\tChIP-seq\tCAL51.ctrl.Vorino.rep3\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorino-p300-rep1\tCAL51.p300.Vorino.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.Vorino.rep1\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorino-p300-rep2\tCAL51.p300.Vorino.rep2\tSE\thg38\tChIP-seq\tCAL51.ctrl.Vorino.rep2\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorinostat-rep1-Ipctrl\tCAL51.ctrl.Vorino.rep1\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorinostat-rep2-Ipctrl\tCAL51.ctrl.Vorino.rep2\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180911A1\t8nysUY9Rxike\tCAL51-Vorinostat-rep3-Ipctrl\tCAL51.ctrl.Vorino.rep3\tSE\thg38\tChIP-seq\t\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-G_T-p300-rep1\tCAL51.p300.GT.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-THZ531-H3K27Ac-rep1\tCAL51.H3K27Ac.THZ531.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-DMSO-H3K27Ac-rep1\tCAL51.H3K27Ac.DMSO.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-Gefit-H3K27Ac-rep1\tCAL51.H3K27Ac.Gefit.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-G_T-H3K27Ac-rep1\tCAL51.H3K27Ac.GT.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.GT.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-DMSO-p300-rep1\tCAL51.p300.DMSO.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.DMSO.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-THZ531-p300-rep1\tCAL51.p300.THZ531.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.THZ531.rep1\t\t\n", "ang_5164\tANG_5164_180906A1\t8nysUY9Rxike\tCAL51-Gefit-p300-rep1\tCAL51.p300.Gefit.rep1\tSE\thg38\tChIP-seq\tCAL51.ctrl.Gefit.rep1\t\t\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download FASTQ from sftp\n", "Create file to download FASTQ files" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/download_CAL51_chiprep.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/download_CAL51_chiprep.sh\n", "#!/bin/bash\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "DATA_HOME=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq\n", "mkdir -p ${DATA_HOME}/raw_reads/\n", "\n", "read CORE_USER CORE_PROJECT CORE_PASS <<< $(cut -f 1,2,3 $METADATA | tail -n+2 | sort -u | head -n 1)\n", "cd ${DATA_HOME}/raw_reads/\n", "lftp -u ${CORE_USER},${CORE_PASS} -e \"mirror --parallel=16 --only-missing ${CORE_USER}/${CORE_PROJECT}; exit\" \"sftp://${CORE_USER}@dnaseq2.igsp.duke.edu\"\n", "\n", "read CORE_USER CORE_PROJECT CORE_PASS <<< $(cut -f 1,2,3 $METADATA | tail -n+2 | sort -u | tail -n 1)\n", "cd ${DATA_HOME}/raw_reads/\n", "lftp -u ${CORE_USER},${CORE_PASS} -e \"mirror --parallel=16 --only-missing ${CORE_USER}/${CORE_PROJECT}; exit\" \"sftp://${CORE_USER}@dnaseq2.igsp.duke.edu\"\n", "\n", "exit\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute file to download files" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash\n", "sbatch -o /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep_download_fastq_files.out \\\n", " -p new,all \\\n", " --wrap=\"ssh xa2@Hardac-xfer.genome.duke.edu 'sh /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/download_CAL51_chiprep.sh' \"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ungzip FASTQ files" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/ungzip_CAL51_chiprep.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/ungzip_CAL51_chiprep.sh\n", "#!/bin/bash\n", "ORDER=CAL51_chiprep\n", "PROJECT_HOME=/data/reddylab/Hazel/chipseq/chiprep\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "RAW_DATA_DIR=${PROJECT_HOME}/data/chip_seq/raw_reads/${ORDER}\n", "\n", "seq_name_header=$(/bin/grep -Eoi \"sequencing.?core.?library.?name\" ${METADATA})\n", "if [[ $? == 1 ]];\n", "then\n", " echo -e \"ERROR: Sequencing core library name not found in ${METADATA}. Fix it\"\n", " exit 1\n", "fi\n", "\n", "if [[ ! -e ${RAW_DATA_DIR} ]];\n", "then\n", " RAW_DATA_DIR=${PROJECT_HOME}/data/chip_seq/raw_reads/${ORDER^^}\n", "fi\n", "PROCESSED_DATA_DIR=${PROJECT_HOME}/data/chip_seq/processed_raw_reads/${ORDER}\n", "\n", "mkdir -p ${PROCESSED_DATA_DIR}\n", "\n", "sample=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols=\"${seq_name_header}\" ${METADATA} |\\\n", " awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');\n", "\n", "for ii in $(/bin/find ${RAW_DATA_DIR} -type f | /bin/grep \"${sample/ /}_.*.gz\");\n", "do\n", " sample_gz_file=$(basename ${ii})\n", " if [[ ! -e ${PROCESSED_DATA_DIR}/${sample_gz_file/.gz/} ]];\n", " then\n", " gzip -cd ${ii} > ${PROCESSED_DATA_DIR}/${sample_gz_file/.gz/};\n", " fi\n", "done" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute file to ungzip FASTQ files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "sbatch -o /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep_ungzip_fastq_files_%a.out \\\n", " -p new,all \\\n", " --array 0-44%20 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/ungzip_CAL51_chiprep.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Merge lanes of FASTQ files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/merge_lanes_CAL51_chiprep.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/merge_lanes_CAL51_chiprep.sh\n", "#!/bin/bash\n", "#SBATCH --array=0-45%20\n", "ORDER=CAL51_chiprep\n", "PROCESSED_DATA_DIR=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/processed_raw_reads/${ORDER}\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "\n", "mkdir -p ${PROCESSED_DATA_DIR}\n", "cd ${PROCESSED_DATA_DIR}\n", "\n", "seq_name_header=$(/bin/grep -Eoi \"sequencing.?core.?library.?name\" ${METADATA})\n", "if [[ $? == 1 ]];\n", "then\n", " echo -e \"ERROR: Sequencing core library name not found in ${METADATA}\"\n", " exit 1\n", "fi\n", "\n", "name_header=$(/bin/grep -Poi \"\\tname\\t\" ${METADATA})\n", "if [[ $? == 1 ]];\n", "then\n", " echo -e \"ERROR: Library Name column not found in ${METADATA}\"\n", " exit 1\n", "fi\n", "name_header=$(echo ${name_header} | cut -f2)\n", "\n", "seq_type_header=$(head -1 ${METADATA} | /bin/grep -Poi \"paired.?end.?or.?single.?end\")\n", "if [[ $? == 1 ]];\n", "then\n", " echo -e \"ERROR: Paired-end or single-end column not found in ${METADATA}\"\n", " exit 1\n", "fi\n", "\n", "sample_seq_name=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols=\"${seq_name_header}\" ${METADATA} \\\n", " | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');\n", "sample_name=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols=\"${name_header}\" ${METADATA} \\\n", " | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');\n", "seq_type=$(/data/reddylab/software/bin/print_tab_cols.awk -v cols=\"${seq_type_header}\" ${METADATA} \\\n", " | awk -v SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID} 'NR==SLURM_ARRAY_TASK_ID+1{print}');\n", "\n", "\n", "for read_pair in R1 R2;\n", "do\n", " sample_files=$(/bin/ls ${sample_seq_name/ /}_S[0-9]*_L[0-9][0-9][0-9]_${read_pair}_*)\n", " if [[ ${read_pair} == \"R1\" || (${seq_type/ /} == \"PE\" || ${seq_type/ /} == \"pe\") ]];\n", " then\n", " # Merge all lanes\n", " merged=$(echo ${sample_files} | awk '{print $1}' | sed -e 's/_L[0-9]\\{3\\}_/_/')\n", " cat ${sample_files} > ${merged};\n", " rm -f ${sample_files}\n", "\n", " # Rename samples with our sample Names\n", " dest_filename=$(basename ${merged} | sed -r 's/\\_S[0-9]+//; s/\\_(R[12])\\_/\\.\\1\\./; s/\\.[0-9]+\\.fastq/\\.fastq/')\n", " mv ${merged} ${dest_filename}\n", "\n", " cleaned_dest_filename=${dest_filename/${sample_seq_name/ /}/${sample_name/ /}}\n", "\n", " if [[ ${seq_type/ /} == \"SE\" || ${seq_type/ /} == \"se\" ]];\n", " then\n", " cleaned_dest_filename=${cleaned_dest_filename/.R1/}\n", " fi\n", " \n", " mv ${dest_filename} ${cleaned_dest_filename}\n", " fi\n", "done\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute file to merge lanes of FASTQ files" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "sbatch -o /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep_merge_fastq_files_%a.out \\\n", " -p new,all \\\n", " --depend afterok:$1 \\\n", " --array 0-44%20 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/merge_lanes_CAL51_chiprep.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create JSON files for CWL pipeline files" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/cwl_json_gen_CAL51_chiprep.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/cwl_json_gen_CAL51_chiprep.sh\n", "#!/bin/bash\n", "ORDER=CAL51_chiprep\n", "PROCESSED_DATA_DIR=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/processed_raw_reads/${ORDER}\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "\n", "python /data/reddylab/software/cwl/GGR-cwl/v1.0/json-generator/run.py \\\n", " -m ${METADATA} \\\n", " -d ${PROCESSED_DATA_DIR} \\\n", " -o /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/jsons \\\n", " -t chip-seq \\\n", " --mem 24000 \\\n", " --nthreads 16 \\\n", " --separate-jsons\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute file to create JSON files" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "source /data/reddylab/software/miniconda2/bin/activate cwl10\n", "sbatch -o /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep_cwl_json_gen.out \\\n", " --depend afterok:$1 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/cwl_json_gen_CAL51_chiprep.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create SLURM array master bash file for se samples" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=cwl_chip_seq\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep-se-%a.out\n", "#SBATCH --mail-user=xa2@duke.edu\n", "#SBATCH --mail-type=FAIL,END\n", "#SBATCH --mem=24000\n", "#SBATCH --cpus-per-task=16\n", "\n", "export PATH=\"/data/reddylab/software/bin:$PATH\"\n", "export PATH=\"/data/reddylab/software/cwl/bin:$PATH\"\n", "export PATH=\"/data/reddylab/software/preseq_v2.0:$PATH\"\n", "export PATH=\"/data/reddylab/software/rsem-1.2.21/:$PATH\"\n", "export PATH=\"/data/reddylab/software/phantompeakqualtools-1.2/:$PATH\"\n", "export PATH=\"/data/reddylab/software/miniconda2/envs/cwl10/bin:$PATH\"\n", "\n", "module load bedtools2\n", "module load fastqc\n", "module load samtools\n", "\n", "# For Fastqc\n", "export DISPLAY=:0.0\n", "\n", "# Make sure temporary files and folders are created in a specific folder\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/tmpdirs/tmp-CAL51_chiprep-se-${SLURM_ARRAY_TASK_ID}-\n", "export TMPDIR=\"/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/tmpdirs/tmp-CAL51_chiprep-se-${SLURM_ARRAY_TASK_ID}-\"\n", "\n", "cwltool --debug \\\n", " --non-strict \\\n", " --preserve-environment PATH \\\n", " --preserve-environment DISPLAY \\\n", " --preserve-environment TMPDIR \\\n", " --outdir /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se \\\n", " --no-container \\\n", " /data/reddylab/software/cwl/GGR-cwl/v1.0/ChIP-seq_pipeline/pipeline-se.cwl \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/jsons/chip_seq_download_metadata.CAL51_chiprep-se-${SLURM_ARRAY_TASK_ID}.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute SLURM array master file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "source /data/reddylab/software/miniconda2/bin/activate cwl10\n", "sbatch -p new,all \\\n", " --depend afterok:$1 \\\n", " --array 0-15%20 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'blocking_job_str' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mblocking_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Submitted batch job (\\d+).*'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblocking_job_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'blocking_job_str' is not defined" ] } ], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create QC generating script" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=qc\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/qc_gen.CAL51_chiprep-se.out\n", "\n", "source /data/reddylab/software/miniconda2/bin/activate alex\n", "cd /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se\n", "\n", "python /data/reddylab/software/cwl/bin/generate_stats_chipseq_single_end.py ./ \\\n", " -samples $(/bin/ls -1 *PBC.txt | sed 's@.PBC.txt@@') \\\n", "> qc.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate QCs for CAL51_chiprep-se" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "sbatch -p new,all \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create plot generating script" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_plot.CAL51_chiprep-se.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_plot.CAL51_chiprep-se.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=generate_fingerplot\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/generate-plot.CAL51_chiprep-se-%a.out\n", "\n", "# This script is a subscript ran from `countFactors_standard.sh` for\n", "# data insertion into the ChIP-DB web application. Intended for samples\n", "# that follow the Reddy Lab sequencing sample naming conventions.\n", "\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "IN_DIR=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se\n", "OUT_DIR=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se\n", "\n", "# Initialize and read in the indices for each field\n", "FLOWCELL_INDEX=-1; FACTOR_INDEX=-1; INPUT_CTRL_INDEX=-1\n", "HEADER=$(head -n 1 ${METADATA} | tr '[:upper:]' '[:lower:]')\n", "IFS=$'\\t' read -ra ADDR <<< \"$HEADER\"\n", "for i in \"${!ADDR[@]}\"; do\n", " # If index denotes field, change the index. Cut is 1-indexed, add 1\n", " if [[ ${ADDR[$i]} = 'name' ]] ; then\n", " FACTOR_INDEX=$((i+1))\n", " elif [[ ${ADDR[$i]} = 'experiment name' ]] ; then\n", " FLOWCELL_INDEX=$((i+1))\n", " elif [[ ${ADDR[$i]} = 'Control' ]]; then\n", " INPUT_CTRL_INDEX=$((i+1))\n", " fi\n", "done\n", "\n", "# Add 2 to skip 0th line and 1st line which is header\n", "LINE_NUM=$((${SLURM_ARRAY_TASK_ID}+2))\n", "FILE_LINE=$(sed -n \"${LINE_NUM}p\" ${METADATA})\n", "FLOWCELL=$(echo \"$FILE_LINE\" | cut -f${FLOWCELL_INDEX})\n", "FACTOR=$(echo \"$FILE_LINE\" | cut -f${FACTOR_INDEX})\n", "FACTOR_FILE=$(/bin/ls -1 ${IN_DIR}/${FACTOR}*.bam.bai | sed \"s/.bai//\")\n", "INPUT_CTRL=$(echo \"$FILE_LINE\" | cut -f${INPUT_CTRL_INDEX})\n", "INPUT_CTRL_FILE=$(/bin/ls -1 ${IN_DIR}/${INPUT_CTRL}*bam)\n", "if [ $INPUT_CTRL_INDEX == -1 ];\n", "then\n", " INPUT_CTRL=\"\"\n", " INPUT_CTRL_FILE=\"\"\n", "fi\n", "echo \"Factor is: ${FACTOR}, file is: ${FACTOR_FILE}\"\n", "echo \"Flowcell is: ${FLOWCELL}, Input control is: ${INPUT_CTRL}\"\n", "\n", "# Write sample metadata to file with name, timestamp and\n", "# additional information\n", "METADATA_FILE=\"${OUT_DIR}/${FACTOR}_metadata.txt\"\n", "TIMESTAMP=$(stat -c%z ${METADATA} | cut -d' ' -f1)\n", "HEADER=\"Factor\\tFlowcell\\tInput_control\\tTimestamp\"\n", "echo -e ${HEADER} > ${METADATA_FILE}\n", "echo -e \"${FACTOR}\\t${FLOWCELL}\\t${INPUT_CTRL}\\t${TIMESTAMP}\" >> ${METADATA_FILE}\n", "\n", "\n", "# Case where sample has w/ Input control\n", "if [ $INPUT_CTRL_INDEX != -1 ];\n", "then\n", " echo \"Sample, ${FACTOR_FILE}, has Input control ${INPUT_CTRL}\"\n", " plotFingerprint -b ${FACTOR_FILE} ${INPUT_CTRL_FILE} \\\n", " --labels ${FACTOR} ${INPUT_CTRL} \\\n", " --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \\\n", " -T \"Fingerprint of ${FACTOR}\" \\\n", " -plot ${OUT_DIR}/${FACTOR}.png \\\n", " --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab\n", "# Case where sample has no control\n", "else\n", " echo \"Sample, ${FACTOR_FILE}, has no controls\"\n", " plotFingerprint -b ${FACTOR_FILE} \\\n", " --labels ${FACTOR} \\\n", " --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \\\n", " -T \"Fingerprint of ${FACTOR}\" \\\n", " -plot ${OUT_DIR}/${FACTOR}.png \\\n", " --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab\n", "fi\n", "\n", "exit 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate plots and data for website" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "source /data/reddylab/software/miniconda2/bin/activate cwl10\n", "sbatch -p new,all \\\n", " --array 0-15%5 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_plot.CAL51_chiprep-se.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create SLURM array master bash file for se-with-control samples" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se-with-control.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se-with-control.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=cwl_chip_seq\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/CAL51_chiprep-se-with-control-%a.out\n", "#SBATCH --mail-user=xa2@duke.edu\n", "#SBATCH --mail-type=FAIL,END\n", "#SBATCH --mem=24000\n", "#SBATCH --cpus-per-task=16\n", "\n", "export PATH=\"/data/reddylab/software/bin:$PATH\"\n", "export PATH=\"/data/reddylab/software/cwl/bin:$PATH\"\n", "export PATH=\"/data/reddylab/software/preseq_v2.0:$PATH\"\n", "export PATH=\"/data/reddylab/software/rsem-1.2.21/:$PATH\"\n", "export PATH=\"/data/reddylab/software/phantompeakqualtools-1.2/:$PATH\"\n", "export PATH=\"/data/reddylab/software/miniconda2/envs/cwl10/bin:$PATH\"\n", "\n", "module load bedtools2\n", "module load fastqc\n", "module load samtools\n", "\n", "# For Fastqc\n", "export DISPLAY=:0.0\n", "\n", "# Make sure temporary files and folders are created in a specific folder\n", "mkdir -p /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/tmpdirs/tmp-CAL51_chiprep-se-with-control-${SLURM_ARRAY_TASK_ID}-\n", "export TMPDIR=\"/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/tmpdirs/tmp-CAL51_chiprep-se-with-control-${SLURM_ARRAY_TASK_ID}-\"\n", "\n", "cwltool --debug \\\n", " --non-strict \\\n", " --preserve-environment PATH \\\n", " --preserve-environment DISPLAY \\\n", " --preserve-environment TMPDIR \\\n", " --outdir /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se-with-control \\\n", " --no-container \\\n", " /data/reddylab/software/cwl/GGR-cwl/v1.0/ChIP-seq_pipeline/pipeline-se-with-control.cwl \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/jsons/chip_seq_download_metadata.CAL51_chiprep-se-with-control-${SLURM_ARRAY_TASK_ID}.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execute SLURM array master file" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "source /data/reddylab/software/miniconda2/bin/activate cwl10\n", "sbatch -p new,all \\\n", " --depend afterok:$1 \\\n", " --array 0-28%20 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/CAL51_chiprep-se-with-control.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create QC generating script" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se-with-control.sh\n" ] } ], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se-with-control.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=qc\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/qc_gen.CAL51_chiprep-se-with-control.out\n", "\n", "source /data/reddylab/software/miniconda2/bin/activate alex\n", "cd /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se-with-control\n", "\n", "python /data/reddylab/software/cwl/bin/generate_stats_chipseq_single_end.py ./ \\\n", " -samples $(/bin/ls -1 *PBC.txt | sed 's@.PBC.txt@@') \\\n", "> qc.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate QCs for CAL51_chiprep-se-with-control" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "sbatch -p new,all \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_qc_cell_CAL51_chiprep-se-with-control.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create plot generating script" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%writefile /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_plot.CAL51_chiprep-se-with-control.sh\n", "#!/bin/bash\n", "#SBATCH --job-name=generate_fingerplot\n", "#SBATCH --output=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/logs/generate-plot.CAL51_chiprep-se-with-control-%a.out\n", "\n", "# This script is a subscript ran from `countFactors_standard.sh` for\n", "# data insertion into the ChIP-DB web application. Intended for samples\n", "# that follow the Reddy Lab sequencing sample naming conventions.\n", "\n", "METADATA=/data/reddylab/Hazel/chipseq/chiprep/data/chip_seq/metadata/chip_seq_download_metadata.CAL51_chiprep.txt\n", "IN_DIR=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se-with-control\n", "OUT_DIR=/data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/CAL51_chiprep-se-with-control\n", "\n", "# Initialize and read in the indices for each field\n", "FLOWCELL_INDEX=-1; FACTOR_INDEX=-1; INPUT_CTRL_INDEX=-1\n", "HEADER=$(head -n 1 ${METADATA} | tr '[:upper:]' '[:lower:]')\n", "IFS=$'\\t' read -ra ADDR <<< \"$HEADER\"\n", "for i in \"${!ADDR[@]}\"; do\n", " # If index denotes field, change the index. Cut is 1-indexed, add 1\n", " if [[ ${ADDR[$i]} = 'name' ]] ; then\n", " FACTOR_INDEX=$((i+1))\n", " elif [[ ${ADDR[$i]} = 'experiment name' ]] || [[ ${ADDR[$i]} = 'sequencing core project' ]] ; then\n", " FLOWCELL_INDEX=$((i+1))\n", " elif [[ ${ADDR[$i]} = 'Control' ]]; then\n", " INPUT_CTRL_INDEX=$((i+1))\n", " fi\n", "done\n", "\n", "# Add 2 to skip 0th line and 1st line which is header\n", "LINE_NUM=$((${SLURM_ARRAY_TASK_ID}+2))\n", "FILE_LINE=$(sed -n \"${LINE_NUM}p\" ${METADATA})\n", "FLOWCELL=$(echo \"$FILE_LINE\" | cut -f${FLOWCELL_INDEX})\n", "FACTOR=$(echo \"$FILE_LINE\" | cut -f${FACTOR_INDEX})\n", "FACTOR_FILE=$(/bin/ls -1 ${IN_DIR}/${FACTOR}*.bam.bai | sed \"s/.bai//\")\n", "INPUT_CTRL=$(echo \"$FILE_LINE\" | cut -f${INPUT_CTRL_INDEX})\n", "INPUT_CTRL_FILE=$(/bin/ls -1 ${IN_DIR}/${INPUT_CTRL}*bam)\n", "if [ $INPUT_CTRL_INDEX == -1 ];\n", "then\n", " INPUT_CTRL=\"\"\n", " INPUT_CTRL_FILE=\"\"\n", "fi\n", "echo \"Factor is: ${FACTOR}, file is: ${FACTOR_FILE}\"\n", "echo \"Flowcell is: ${FLOWCELL}, Input control is: ${INPUT_CTRL}\"\n", "\n", "# Write sample metadata to file with name, timestamp and\n", "# additional information\n", "METADATA_FILE=\"${OUT_DIR}/${FACTOR}_metadata.txt\"\n", "TIMESTAMP=$(stat -c%z ${METADATA} | cut -d' ' -f1)\n", "HEADER=\"Factor\\tFlowcell\\tInput_control\\tTimestamp\"\n", "echo -e ${HEADER} > ${METADATA_FILE}\n", "echo -e \"${FACTOR}\\t${FLOWCELL}\\t${INPUT_CTRL}\\t${TIMESTAMP}\" >> ${METADATA_FILE}\n", "\n", "\n", "# Case where sample has w/ Input control\n", "if [ $INPUT_CTRL_INDEX != -1 ];\n", "then\n", " echo \"Sample, ${FACTOR_FILE}, has Input control ${INPUT_CTRL}\"\n", " plotFingerprint -b ${FACTOR_FILE} ${INPUT_CTRL_FILE} \\\n", " --labels ${FACTOR} ${INPUT_CTRL} \\\n", " --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \\\n", " -T \"Fingerprint of ${FACTOR}\" \\\n", " -plot ${OUT_DIR}/${FACTOR}.png \\\n", " --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab\n", "# Case where sample has no control\n", "else\n", " echo \"Sample, ${FACTOR_FILE}, has no controls\"\n", " plotFingerprint -b ${FACTOR_FILE} \\\n", " --labels ${FACTOR} \\\n", " --outQualityMetrics ${OUT_DIR}/${FACTOR}_QCmetrics.txt \\\n", " -T \"Fingerprint of ${FACTOR}\" \\\n", " -plot ${OUT_DIR}/${FACTOR}.png \\\n", " --outRawCounts ${OUT_DIR}/${FACTOR}_counts.tab\n", "fi\n", "\n", "exit 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate plots and data for website" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%script --out blocking_job_str bash -s \"$blocking_job\"\n", "source /data/reddylab/software/miniconda2/bin/activate cwl10\n", "sbatch -p new,all \\\n", " --depend afterok:$1 \\\n", " --array 0-28%5 \\\n", " /data/reddylab/Hazel/chipseq/chiprep/processing/chip_seq/scripts/generate_plot.CAL51_chiprep-se-with-control.sh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract blocking job id" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "blocking_job = re.match('Submitted batch job (\\d+).*', blocking_job_str).group(1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 1 }