Building Sucessful Sequencing Libraries
Slides
#| echo: true
#| eval: false
#################################################################################################################################
All the commands from the tutorial, organized by section. You can execute these commands sequentially in your terminal
#################################################################################################################################
#Common Command-Line Tools for FASTQ Files
#Ensure all necessary tools are installed on system. You can also install them using package managers like apt, brew,
# grep: Search for patterns/sudo apt-get install grep
# awk: Perform text processing/sudo apt-get install gawk
# sed: Edit text in a stream/sudo apt-get install sed
# seqkit: A versatile toolkit for FASTA/FASTQ files/sudo apt-get install seqkit
# fastp: Quality control and adapter trimming/sudo apt-get install fastp
# cutadapt: Adapter trimming/sudo apt-get install cutadapt/ pip install cutadapt
# seqtk: FASTQ manipulation/sudo apt-get install seqtk
echo "Running this script will make your life easier... unless you forgot to install seqkit. Then it's just pain. 😆"
########################################################################################################
# <<< PART A >>> #
########################################################################################################
# -----------------------------
# Step 1: Understanding the FASTQ File Format
# -----------------------------
#Find the size of FASTQ files
ls -lh 1001.fastq
# View the first 8 lines (2 records)
head -8 1001.fastq
# Count the total number of records
wc -l 1001.fastq | awk '{print $1/4}'
# Search for sequences containing "GATTTG"
grep -B 1 -A 2 "GATTTG" 1001.fastq
#| echo: true
#| eval: false
# -----------------------------
# Step 2: Filtering FASTQ Records
# -----------------------------
# Filter sequences with average quality > 30
seqtk seq -q 30 -Q 33 1001.fastq > filtered.fastq
#| echo: true
#| eval: false
# -----------------------------
# Step 3: Trimming Adapters
# -----------------------------
# Search for adapter sequence "GGCCTCAGTGAAGGTCTCCTGCAAG"
grep -A 1 "GGCCTCAGTGAAGGTCTCCTGCAAG" 1001.fastq | less
# Trim adapters using cutadapt
cutadapt -a grep -A 1 GGCCTCAGTGAAGGTCTCCTGCAAG 1001.fastq | less -o trimmed.fastq 1001.fastq
#For paired Reads
cutadapt -a GGCCTCAGTGAAGGTCTCCTGCAAG -A REVERSE_ADAPTER -o trimmed_R1.fastq -p trimmed_R2.fastq file_R1.fastq file_R2.fastq
# Trim adapters and perform quality control using fastp
fastp -i 1001.fastq -o trimmed.fastq -a GGCCTCAGTGAAGGTCTCCTGCAAG
#| echo: true
#| eval: false
# -----------------------------
# Step 4: Advanced Adapter Identification
# -----------------------------
# De novo adapter detection using FastQC
fastqc 1001.fastq
# Plot adapter content using fastp
fastp -i 1001.fastq -o output.fastq --detect_adapter_for_pe --html adapter_report.html
#| echo: true
#| eval: false
# -----------------------------
# Step 4: Manipulating FASTQ Files
# -----------------------------
# Convert FASTQ to FASTA
seqtk seq -A 1001.fastq > 1001.fasta
# Split a large FASTQ file into smaller files (e.g., 1 million reads each)
seqkit split2 -p 10 1001.fastq
# Combine two FASTQ files (e.g., paired-end reads)
cat file1.fastq file2.fastq > merged.fastq
# -----------------------------
# Step 4: Extract Subsets of Reads
# -----------------------------
# Extract reads longer than 50 bp using awk
awk 'NR%4==2 && length($0) > 50 {print $0}' 1001.fastq > long_reads.fastq
# Randomly select 10,000 reads using seqtk
seqtk sample -s100 1001.fastq 10000 > subsample.fastq
# -----------------------------
# Step 5: Compressing and Decompressing FASTQ Files
# -----------------------------
# Compress a FASTQ file
gzip 1001.fastq
# Decompress a FASTQ file
gunzip 1001.fastq.gz
# Work with compressed files directly
zcat 1001.fastq.gz | head -8
#| echo: true
#| eval: false
# -----------------------------
# Step 6: Troubleshooting Tips
# -----------------------------
# Ensure FASTQ integrity by checking line count
wc -l 1001.fastq
# Check for duplicate sequence IDs
grep "^@" 1001.fastq | sort | uniq -c | sort -nr | head
# Repair corrupted paired-end files using seqkit
seqkit pair file1.fastq file2.fastq
#| echo: true
#| eval: false
########################################################################################################
# <<< PART B >>> #
########################################################################################################
#B.1 Understanding For loops, using Fruits examples
#########################################################
#!/bin/bash
# Demonstrating a FOR loop with fruits
echo "FOR loop example:"
for fruit in Apple Banana Cherry Date Elderberry; do
echo "Fruit: $fruit"
done
qc_pipeline.sh
#| echo: true
#| eval: false
#B.2 Applying it to QC
#########################################################
#!/bin/bash
INPUT_FASTQ_DIR="/home/afrahkhairallah/B/Run1/Run_1_24July2023/Miseq/Example_Files"
RAW_DATA_DIR="/home/afrahkhairallah/B/Run1/Run_1_24July2023/Miseq/Example_Files"
QC_REPORTS_DIR="/home/afrahkhairallah/B/Run1/Run_1_24July2023/Miseq/Example_Files/qc_reports"
mkdir -p $QC_REPORTS_DIR
echo "FASTQC running... If your quality scores look bad, just remember: even the best scientists make ‘random errors’ too"
for fastq_file in $RAW_DATA_DIR/*.fastq; do
echo "Processing $fastq_file ..."
fastqc "$fastq_file" -o "$QC_REPORTS_DIR"
echo "Done with $fastq_file"
done
Now to run the script:
#| echo: true
#| eval: false
echo "Running MultiQC ..."
multiqc $QC_REPORTS_DIR -o $QC_REPORTS_DIR
echo "Processing FASTQ files… because Excel said ‘File too large.’ 💀"
echo "QC pipeline completed."
chmod +x qc_pipeline.sh
./qc_pipeline.sh
########################################################################################################
# <<< PART C >>> #
########################################################################################################
#!/bin/bash
echo "Bioinformatics: Where we spend 90% of our time debugging, and the other 10% pretending it’s not our fault. 🧑💻🧐"
# Define input and output directories
INPUT_FASTQ_DIR="/home/afrahkhairallah/B/Run1/Run_1_24July2023/Miseq/Example_Files"
OUTPUT_DIR="/home/afrahkhairallah/B/Run1/Run_1_24July2023/Miseq/Example_Files/output"
# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# -----------------------------
# Step 1: Understand FASTQ Files
# -----------------------------
echo "Step 1: Inspecting the FASTQ file format..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
echo "Processing file: $FILE"
head -8 "$FILE"
echo "Total records in $FILE:"
wc -l "$FILE" | awk '{print $1/4}'
done
# -----------------------------
# Step 2: Search for Patterns
# -----------------------------
echo "Step 2: Searching for a specific sequence (e.g., GATTTG)..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
echo "Searching in: $FILE"
grep -B 1 -A 2 "GATTTG" "$FILE" | head -10
done
# -----------------------------
# Step 3: Filter FASTQ Records by Quality
# -----------------------------
echo "Step 3: Filtering FASTQ records with quality scores above 30..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_filtered.fastq"
seqtk seq -q 30 -Q 33 "$FILE" > "$OUTPUT_FILE"
done
echo "ERROR: Just kidding. Everything is fine. Probably. Maybe. Who knows?"
# -----------------------------
# Step 4: Extract Records by ID
# -----------------------------
echo "Step 4: Extracting specific records by IDs..."
IDS_FILE="$OUTPUT_DIR/ids.txt"
echo -e "SEQ_ID_1\nSEQ_ID_2" > "$IDS_FILE"
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_extracted.fastq"
seqtk subseq "$FILE" "$IDS_FILE" > "$OUTPUT_FILE"
done
# -----------------------------
# Step 5: Trim Adapters
# -----------------------------
echo "Step 5: Trimming adapters (e.g., GGCCTCAGTGAAGGTCTCCTGCAAG)..."
ADAPTER="GGCCTCAGTGAAGGTCTCCTGCAAG"
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_trimmed.fastq"
cutadapt -a "$ADAPTER" -o "$OUTPUT_FILE" "$FILE"
done
echo "Adapter trimming complete! Now your reads are like my social skills—shorter, but still functional. 😆"
# -----------------------------
# Step 6: Perform Quality Control
# -----------------------------
echo "Step 6: Generating quality control reports..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
fastqc -o "$OUTPUT_DIR" "$FILE"
done
# -----------------------------
# Step 7: Convert FASTQ to FASTA
# -----------------------------
echo "Step 7: Converting FASTQ to FASTA format..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq).fasta"
seqtk seq -A "$FILE" > "$OUTPUT_FILE"
done
# -----------------------------
# Step 8: Subsample Reads
# -----------------------------
echo "Step 8: Randomly subsampling 10,000 reads..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_subsampled.fastq"
seqtk sample -s100 "$FILE" 10000 > "$OUTPUT_FILE"
done
# -----------------------------
# Step 9: Extract Reads by Length
# -----------------------------
echo "Step 9: Extracting reads longer than 50bp..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_longreads.fastq"
awk 'NR%4==2 && length($0) > 50' "$FILE" > "$OUTPUT_FILE"
done
echo "If this script fails, remember: it's not a bug, it's an *undocumented feature*"
# -----------------------------
# Step 10: Split FASTQ Files
# -----------------------------
echo "Step 10: Splitting FASTQ file into smaller chunks..."
SPLIT_DIR="$OUTPUT_DIR/split"
mkdir -p "$SPLIT_DIR"
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
seqkit split2 -p 5 -O "$SPLIT_DIR" "$FILE"
done
# -----------------------------
# Step 11: Compress FASTQ Files
# -----------------------------
echo "Step 11: Compressing FASTQ files..."
echo "This might take a while. Go grab a coffee ☕"
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
OUTPUT_FILE="$OUTPUT_DIR/$(basename "$FILE" .fastq)_compressed.fastq.gz"
gzip -c "$FILE" > "$OUTPUT_FILE"
done
# -----------------------------
# Step 12: Check FASTQ Integrity
# -----------------------------
echo "Step 12: Checking FASTQ file integrity..."
for FILE in "$INPUT_FASTQ_DIR"/*.fastq; do
echo "Checking integrity of: $FILE"
awk 'NR%4==1' "$FILE" | wc -l
awk 'NR%4==0' "$FILE" | wc -l
done
# -----------------------------
# Completion Message
# -----------------------------
echo "FASTQ processing completed. Results are stored in '$OUTPUT_DIR'."
echo "Script complete! If everything worked, congrats! 🎉"