We then checked the sequencing runs quality using nanoplot version ...:
In [ ]:
NanoPlot --summary sequencing_summary.txt -t 20 -o /path/to/output/directory
According to https://doi.org/10.1371/journal.pcbi.1010905 filtering of the nanopore reads is important. We done this using filtlong version ... (To keep all reads bigger than 6kb and to throw away the 10 % worstest reads of them).
In [ ]:
filtlong --min_length 6000 calls.fastq > reads_qc/ont_6k.fastq
filtlong --keep_percent 90 reads_qc/ont_6k.fastq > reads_qc/ont.fastq
Preparation of the illumina data¶
For the illumina data, we used fastp for filtering:
In [ ]:
fastp --in1 reads_Illumina_1.fastq.gz --in2 reads_Illumina_2.fastq.gz --out1 reads_qc/illumina_1.fastq.gz --out2 reads_qc/illumina_2.fastq.gz --unpaired1 reads_qc/illumina_u.fastq.gz --unpaired2 reads_qc/illumina_u.fastq.gz
Nanopore only assembly using trycycler¶
In [ ]:
trycycler subsample --reads reads_qc/ont.fastq --out_dir read_subsets --genome_size 4.5m
threads=16 # change as appropriate for your system
mkdir assemblies
flye --nano-hq read_subsets/sample_01.fastq --threads "$threads" --out-dir assembly_01 && cp assembly_01/assembly.fasta assemblies/assembly_01.fasta && rm -r assembly_01
miniasm_and_minipolish.sh read_subsets/sample_02.fastq "$threads" > assembly_02.gfa && any2fasta assembly_02.gfa > assemblies/assembly_02.fasta && rm assembly_02.gfa
raven --threads "$threads" --disable-checkpoints read_subsets/sample_03.fastq > assemblies/assembly_03.fasta
flye --nano-hq read_subsets/sample_04.fastq --threads "$threads" --out-dir assembly_04 && cp assembly_04/assembly.fasta assemblies/assembly_04.fasta && rm -r assembly_04
miniasm_and_minipolish.sh read_subsets/sample_05.fastq "$threads" > assembly_05.gfa && any2fasta assembly_05.gfa > assemblies/assembly_05.fasta && rm assembly_05.gfa
raven --threads "$threads" --disable-checkpoints read_subsets/sample_06.fastq > assemblies/assembly_06.fasta
flye --nano-hq read_subsets/sample_07.fastq --threads "$threads" --out-dir assembly_07 && cp assembly_07/assembly.fasta assemblies/assembly_07.fasta && rm -r assembly_07
miniasm_and_minipolish.sh read_subsets/sample_08.fastq "$threads" > assembly_08.gfa && any2fasta assembly_08.gfa > assemblies/assembly_08.fasta && rm assembly_08.gfa
raven --threads "$threads" --disable-checkpoints read_subsets/sample_09.fastq > assemblies/assembly_09.fasta
flye --nano-hq read_subsets/sample_10.fastq --threads "$threads" --out-dir assembly_10 && cp assembly_10/assembly.fasta assemblies/assembly_10.fasta && rm -r assembly_10
miniasm_and_minipolish.sh read_subsets/sample_11.fastq "$threads" > assembly_11.gfa && any2fasta assembly_11.gfa > assemblies/assembly_11.fasta && rm assembly_11.gfa
raven --threads "$threads" --disable-checkpoints read_subsets/sample_12.fastq > assemblies/assembly_12.fasta
trycycler cluster --assemblies assemblies/*.fasta --reads reads_qc/ont.fastq --out_dir trycycler
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_001
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_002
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_003
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_004
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_005
trycycler reconcile --reads reads_qc/ont.fastq --cluster_dir trycycler/cluster_006
trycycler msa --cluster_dir trycycler/cluster_001
trycycler msa --cluster_dir trycycler/cluster_002
trycycler msa --cluster_dir trycycler/cluster_003
trycycler msa --cluster_dir trycycler/cluster_004
trycycler msa --cluster_dir trycycler/cluster_005
trycycler msa --cluster_dir trycycler/cluster_006
trycycler partition --reads reads_qc/ont.fastq --cluster_dirs trycycler/cluster_*
trycycler consensus --cluster_dir trycycler/cluster_001
trycycler consensus --cluster_dir trycycler/cluster_002
trycycler consensus --cluster_dir trycycler/cluster_003
trycycler consensus --cluster_dir trycycler/cluster_004
trycycler consensus --cluster_dir trycycler/cluster_005
trycycler consensus --cluster_dir trycycler/cluster_006
medaka_consensus -i trycycler/cluster_001/4_reads.fastq -d trycycler/cluster_001/7_final_consensus.fasta -o trycycler/cluster_001/medaka -m r104_e81_sup_g610
mv trycycler/cluster_001/medaka/consensus.fasta trycycler/cluster_001/8_medaka.fasta
rm -r trycycler/cluster_001/medaka trycycler/cluster_001/*.fai trycycler/cluster_001/*.mmi
medaka_consensus -i trycycler/cluster_002/4_reads.fastq -d trycycler/cluster_002/7_final_consensus.fasta -o trycycler/cluster_002/medaka -m r104_e81_sup_g610
mv trycycler/cluster_002/medaka/consensus.fasta trycycler/cluster_002/8_medaka.fasta
rm -r trycycler/cluster_002/medaka trycycler/cluster_002/*.fai trycycler/cluster_002/*.mmi
medaka_consensus -i trycycler/cluster_003/4_reads.fastq -d trycycler/cluster_003/7_final_consensus.fasta -o trycycler/cluster_003/medaka -m r104_e81_sup_g610
mv trycycler/cluster_003/medaka/consensus.fasta trycycler/cluster_003/8_medaka.fasta
rm -r trycycler/cluster_003/medaka trycycler/cluster_003/*.fai trycycler/cluster_003/*.mmi
medaka_consensus -i trycycler/cluster_004/4_reads.fastq -d trycycler/cluster_004/7_final_consensus.fasta -o trycycler/cluster_004/medaka -m r104_e81_sup_g610
mv trycycler/cluster_004/medaka/consensus.fasta trycycler/cluster_004/8_medaka.fasta
rm -r trycycler/cluster_004/medaka trycycler/cluster_004/*.fai trycycler/cluster_004/*.mmi
medaka_consensus -i trycycler/cluster_005/4_reads.fastq -d trycycler/cluster_005/7_final_consensus.fasta -o trycycler/cluster_005/medaka -m r104_e81_sup_g610
mv trycycler/cluster_005/medaka/consensus.fasta trycycler/cluster_005/8_medaka.fasta
rm -r trycycler/cluster_005/medaka trycycler/cluster_005/*.fai trycycler/cluster_005/*.mmi
medaka_consensus -i trycycler/cluster_006/4_reads.fastq -d trycycler/cluster_006/7_final_consensus.fasta -o trycycler/cluster_006/medaka -m r104_e81_sup_g610
mv trycycler/cluster_006/medaka/consensus.fasta trycycler/cluster_006/8_medaka.fasta
rm -r trycycler/cluster_006/medaka trycycler/cluster_006/*.fai trycycler/cluster_006/*.mmi
cat trycycler/cluster_*/8_medaka.fasta > trycycler_medaka.fasta
Filtering using polypolish¶
In [ ]:
threads=16
bwa index trycycler_medaka.fasta
bwa mem -t "$threads" -a trycycler_medaka.fasta reads_qc/illumina_1.fastq.gz > alignments_1.sam
bwa mem -t "$threads" -a trycycler_medaka.fasta reads_qc/illumina_2.fastq.gz > alignments_2.sam
polypolish filter --in1 alignments_1.sam --in2 alignments_2.sam --out1 filtered_1.sam --out2 filtered_2.sam
polypolish polish trycycler_medaka.fasta filtered_1.sam filtered_2.sam > trycycler_medaka_polypolish.fasta
rm *.bwt *.pac *.ann *.amb *.sa *.sam
Adjust start points of the assembly to dnaA or repA using dnaapler¶
In [ ]:
dnaapler all -i trycycler_medaka_polypolish.fasta -o output_directory_path -p r_sphaeroides -t 20
In [ ]:
dorado basecaller sup --emit-fastq pod5s > calls.fastq
QC check of the reads were performed in the same way, as described above (nanoplot and filtlong for the nanopore data and fastqc and fastp for the illumina data).
Fully automated assembly using autocycler¶
In [ ]:
reads=ont.fastq.gz # your read set goes here
threads=20 # set as appropriate for your system (no more than 128)
genome_size=$(autocycler helper genome_size --reads "$reads" --threads "$threads") # can set this manually if you know the value
# Step 1: subsample the long-read set into multiple files
autocycler subsample --reads "$reads" --out_dir subsampled_reads --genome_size "$genome_size"
# Step 2: assemble each subsampled file
mkdir assemblies
for assembler in canu flye metamdbg miniasm necat nextdenovo plassembler raven; do
for i in 01 02 03 04; do
autocycler helper "$assembler" --reads subsampled_reads/sample_"$i".fastq --out_prefix assemblies/"$assembler"_"$i" --threads "$threads" --genome_size "$genome_size"
done
done
# Optional step: remove the subsampled reads to save space
rm subsampled_reads/*.fastq
# Step 3: compress the input assemblies into a unitig graph
autocycler compress -i assemblies -a autocycler_out
# Step 4: cluster the input contigs into putative genomic sequences
autocycler cluster -a autocycler_out
# Steps 5 and 6: trim and resolve each QC-pass cluster
for c in autocycler_out/clustering/qc_pass/cluster_*; do
autocycler trim -c "$c"
autocycler resolve -c "$c"
done
# Step 7: combine resolved clusters into a final assembly
autocycler combine -a autocycler_out -i autocycler_out/clustering/qc_pass/cluster_*/5_final.gfa
autocycler gfa2fasta -i autocycler_out/clustering/qc_pass/cluster_*/5_final.gfa -o autocycler_out/final/autocycler.fasta
Short read polishing using pypolca and polypolish¶
In [ ]:
pypolca run -a autocycler_out/final/autocycler.fasta -1 reads_qc/illumina_1.fastq.gz -2 reads_qc/illumina_2.fastq.gz -t 20 -o autocycler_out/final/autocycler_pypolca.fasta
bwa index autocycler_out/final/autocycler_pypolca.fasta
bwa mem -t 16 -a autocycler_out/final/autocycler_pypolca.fasta reads_qc/illumina_1.fastq.gz > alignments_1.sam
bwa mem -t 16 -a autocycler_out/final/autocycler_pypolca.fasta reads_qc/illumina_2.fastq.gz > alignments_2.sam
polypolish filter --in1 alignments_1.sam --in2 alignments_2.sam --out1 filtered_1.sam --out2 filtered_2.sam
polypolish polish autocycler_out/final/autocycler_pypolca.fasta filtered_1.sam filtered_2.sam > autocycler_out/final/autocycler_pypolca_polypolish.fasta
rm *.amb *.ann *.bwt *.pac *.sa *.sam
Adjust the starting point to dnaA or repA of each contig using dnaapler¶
In [ ]:
dnaapler all -i autocycler_out/final/autocycler_pypolca_polypolish.fasta -o output_directory_path -p r_sphaeroides -t 20