R scripts for: “Hybrid genome assemblies of R. sphaeroides DSM158 and its substrain H2”

Author

Anna Nauruschat

Circos plot of the assemblies using circlize

Loading the required packages

library(Biostrings)     # version 2.70.3
library(rtracklayer)    # version 1.62.0
library(GenomicRanges)  # version 1.54.1
library(circlize)       # version 0.4.16
library(data.table)     # version 1.17.8
library(RColorBrewer)   # version 1.1-3
library(ggsci)          # version 4.0.0
library(scales)

Import the genomes and the genome annotations

genomeA <- readDNAStringSet("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/DSM158/Cereibacter_sphaeroides_reoriented.fasta")
gffA <- import("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/DSM158/annot.gff")

genomeB <- readDNAStringSet("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/SubH2/GCF_049434525.1_MWCSPHH2ANNA_genomic.fasta")
gffB <- import("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/SubH2/genomic.gff")

Tidying the sequence names (remove file extension) and rename the chromosomes

#remove the file extensions (.fasta or .gff) from the sequence names
names(genomeA) <- sub(" .*", "", names(genomeA))
names(genomeB) <- sub(" .*", "", names(genomeB))
seqlevels(gffA) <- sub(" .*", "", seqlevels(gffA))
seqlevels(gffB) <- sub(" .*", "", seqlevels(gffB))

#Rename the chromosomes A_... or B_...
new_namesA <- paste0("A_", names(genomeA))
new_namesB <- paste0("B_", names(genomeB))

#Extract the lengths for each chromosome
chrom_lensA <- data.frame(chr=new_namesA, start=0, end=width(genomeA))
chrom_lensB <- data.frame(chr=new_namesB, start=0, end=width(genomeB))
chrom_lens <- rbind(chrom_lensA, chrom_lensB)

#Initialize the sectors for circlize
chrom_lens$chr <- as.character(chrom_lens$chr)

Create a sliding window (for the calculation of gene densities and GC content) of size 10,000 bp

win_size <- 10000
windowsA <- tileGenome(seqlengths(genomeA), tilewidth=win_size, cut.last.tile.in.chrom=TRUE)
windowsB <- tileGenome(seqlengths(genomeB), tilewidth=win_size, cut.last.tile.in.chrom=TRUE)

Calculate the gene densities

genesA <- gffA[gffA$type=="gene"]
gene_countsA <- countOverlaps(windowsA, genesA)
gendichteA <- data.frame(
  chr=factor(paste0("A_", seqnames(windowsA)), levels=new_namesA),
  start=start(windowsA), end=end(windowsA), value=gene_countsA
)

genesB <- gffB[gffB$type=="gene"]
gene_countsB <- countOverlaps(windowsB, genesB)
gendichteB <- data.frame(
  chr=factor(paste0("B_", seqnames(windowsB)), levels=new_namesB),
  start=start(windowsB), end=end(windowsB), value=gene_countsB
)

Calculate the GC content

calc_gc <- function(genome, windows, prefix) {
  vals <- numeric(length(windows))
  for (i in seq_along(windows)) {
    chr <- as.character(seqnames(windows[i]))
    seq <- subseq(genome[[chr]], start(windows[i]), end(windows[i]))
    vals[i] <- sum(letterFrequency(seq, c("G","C"), as.prob=TRUE))
  }
  data.frame(
    chr=factor(paste0(prefix, seqnames(windows)), levels=c(new_namesA,new_namesB)),
    start=start(windows), end=end(windows), value=vals
  )
}

gcA <- calc_gc(genomeA, windowsA, "A_")
gcB <- calc_gc(genomeB, windowsB, "B_")

Data for the phage sequences (PHASTER web-server)

repeatA <- data.frame(
  chr = c("1","1","1","1", "1", "2", "2", "2"),
  start = c(299942, 312405, 714792, 1045831, 1184739, 16443, 36955, 184600),
  end   = c(314808, 334525, 756993, 1067027, 1202250, 38142, 54813, 208162),
  type  = c("Incomplete","Incomplete","Questionable","Incomplete", "Incomplete","Incomplete", "Incomplete","Incomplete")
)
repeatB <- data.frame(
  chr = c("NZ_CP186561.1","NZ_CP186561.1","NZ_CP186561.1","NZ_CP186561.1", "NZ_CP186561.1", "NZ_CP186562.1", "NZ_CP186562.1", "NZ_CP186562.1"),
  start = c(299941, 312404, 714790, 1045832, 1184740, 9996, 157638, 920625),
  end   = c(314807, 333081, 756989, 1067028, 1202251, 27853, 181200, 944010),
  type  = c("Incomplete","Incomplete","Questionable","Incomplete", "Incomplete","Incomplete", "Incomplete","Incomplete")
)
repeatA$chr <- paste0("A_", repeatA$chr)
repeatB$chr <- paste0("B_", repeatB$chr)
repeat_df <- rbind(repeatA, repeatB)

Read the coordinates from a MUMMer coordinates file (for the synteny track)

read_mummer_coords <- function(file){
  raw <- readLines(file)
  header_line <- grep("\\[S1\\]", raw)
  data_lines <- raw[(header_line+2):length(raw)]
  data_clean <- gsub("\\|", "", data_lines)
  data_clean <- gsub("\\s+", " ", data_clean)
  coords <- read.table(text=data_clean, header=FALSE, stringsAsFactors=FALSE)
  if(ncol(coords)==8){
    colnames(coords) <- c("startA","endA","startB","endB","lenA","lenB","identity","chrA")
    coords$chrB <- coords$chrA
  } else if(ncol(coords)==11){
    colnames(coords) <- c("startA","endA","startB","endB",
                          "lenA","lenB","identity",
                          "lenRef","lenQry","chrA","chrB")
  } else if(ncol(coords)==13){
    colnames(coords) <- c("startA","endA","startB","endB",
                          "lenA","lenB","identity",
                          "lenRef","lenQry","covRef","covQry","chrA","chrB")
  } else {stop("Unbekanntes Format!")}
  coords
}

coords <- read_mummer_coords("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/MUMer/synteny.coords")
synteny <- data.frame(
  chrA = paste0("A_", coords$chrA),
  startA = coords$startA,
  endA = coords$endA,
  chrB = paste0("B_", coords$chrB),
  startB = coords$startB,
  endB = coords$endB
)

Color palette for the links (npg palette from the `ggsci` package) and for the other tracks

link_colors <- c(pal_npg("nrc", alpha = 0.2)(10), 
                 alpha("#00d087B2",0.2), 
                 alpha("#0c5f88b2",0.2)
                 )

colorA <- "#00a087B2"  # Green for genome A (DSM158)
colorB <- "#0c5488b2"  # Blue for genome B (SubH2)

# match genomes wth colors
chrom_colors <- ifelse(grepl("^A_", chrom_lens$chr), colorA, colorB)

Prepare the new chromosome names

# Chromosome-IDs in the plot
chrom_ids <- chrom_lens$chr

# New names (ordered)
new_labels <- c("Chromosome 1", "Chromosome 2", "PlA",
                "PlB", "PlC", "PlDx","Chromosome 1",
                "Chromosome 2", "PlA", "PlB", "PlC", "PlDx")

# Control if there are the same number og new labels and old labels
stopifnot(length(chrom_ids) == length(new_labels))

# Match new labels and old labels
names(new_labels) <- chrom_ids

Circos plot

circos.clear()
circos.par("start.degree"=90, gap.degree=c(rep(3,5),6,rep(3,5),6), cell.padding = c(0.02, 0, 0.02, 0), track.margin=c(0.01,0.01))

circos.initialize(factors=chrom_lens$chr,
                  xlim=cbind(chrom_lens$start, chrom_lens$end))

# Chromosomes at the outside
circos.trackPlotRegion(
  ylim=c(0,1),
  track.height=0.05,
  bg.border=NA,
  bg.col = chrom_colors,
  panel.fun=function(x,y){
    chr <- CELL_META$sector.index
    circos.text(
      CELL_META$xcenter,
      CELL_META$ycenter+ mm_y(4),
      col = "black",
      labels = new_labels[chr],   # Choosing name for each sector
      facing="inside",
      niceFacing=F,
      cex=0.7
    )
    circos.genomicAxis(h="top", labels.cex = 0.5)
  }
)

Note: 1 point is out of plotting region in sector 'A_1', track '1'.

Note: 1 point is out of plotting region in sector 'A_2', track '1'.

Note: 1 point is out of plotting region in sector 'A_3', track '1'.

Note: 1 point is out of plotting region in sector 'A_4', track '1'.

Note: 1 point is out of plotting region in sector 'A_5', track '1'.

Note: 1 point is out of plotting region in sector 'A_6', track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186561.1',
track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186562.1',
track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186563.1',
track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186564.1',
track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186565.1',
track '1'.

Note: 1 point is out of plotting region in sector 'B_NZ_CP186566.1',
track '1'.

# Track 1: gene densities
maxA <- max(gendichteA$value)
maxB <- max(gendichteB$value)

col_funA <- colorRamp2(c(0, maxA), c("#eeeeee", "#00a087B2"))  # grey → green
col_funB <- colorRamp2(c(0, maxB), c("#eeeeee", "#0c5488b2"))  # grey → blue
gendichte_df <- rbind(gendichteA, gendichteB)

circos.track(ylim=c(0,1), track.height=0.1, bg.border=NA,
             panel.fun=function(region,value,...){
               chr <- CELL_META$sector.index
               idx <- gendichte_df$chr == chr
               for(i in which(idx)){
                 col_use <- ifelse(grepl("^A_", chr), col_funA(gendichte_df$value[i]), col_funB(gendichte_df$value[i]))
                 circos.rect(gendichte_df$start[i], 0,
                             gendichte_df$end[i], 1,
                             col=col_use, border=NA)
               }
             })
# Track 2: Phage sequences
repeat_types <- unique(repeat_df$type)
colors_repeats <- setNames(pal_npg("nrc", alpha = 0.7)(2)[1:length(repeat_types)], repeat_types)

circos.track(
  ylim = c(0,1), 
  track.height = 0.1,
  bg.border = "grey", 
  bg.col = "grey95",
  panel.fun = function(region, value, ...) {
    chr <- CELL_META$sector.index
    idx <- repeat_df$chr == chr
    
    if(any(idx)) {
      ybottom <- CELL_META$ylim[1]
      ytop    <- CELL_META$ylim[2]
      
      circos.rect(
        xleft  = repeat_df$start[idx],
        ybottom= ybottom,
        xright = repeat_df$end[idx],
        ytop   = ytop,
        col    = adjustcolor(colors_repeats[repeat_df$type[idx]], alpha.f=0.7),
        border = NA
      )
    }
  }
)
# Track 3: GC content
gc_df <- rbind(gcA,gcB)
circos.track(ylim=c(0,1), track.height=0.1, bg.border="grey", bg.col=NA,
             panel.fun=function(region,value,...){
               chr <- CELL_META$sector.index
               idx <- gc_df$chr == chr
               circos.lines((gc_df$start[idx]+gc_df$end[idx])/2, gc_df$value[idx],
                            col="#e64b35b2", lwd=1)
             })
first_chr <- chrom_lens$chr[1]
circos.yaxis(side="left", sector.index=first_chr,
             labels.cex=0.5, col="darkgrey", labels.col="black", tick.length=mm_y(2))
# Innermost track: synteny
for(i in 1:nrow(synteny)){
  if(synteny$chrA[i] %in% chrom_lens$chr &&
     synteny$chrB[i] %in% chrom_lens$chr){
    circos.link(
      sector.index1=synteny$chrA[i],
      point1=c(synteny$startA[i], synteny$endA[i]),
      sector.index2=synteny$chrB[i],
      point2=c(synteny$startB[i], synteny$endB[i]),
      col = link_colors[i],
      border=NA
    )
  }}

Synteny and phylogenetic tree of 18 complete R. sphaeroides genomes

Visualization of progressiveMauve Output

Loading required packages:

library(ggplot2)  # version 4.0.0
library(dplyr)    # version 1.1.4
library(stringr)  # version 1.5.1
library(scales)   # version 1.3.0
library(readxl)   # version 1.4.3

Function for parsing the `progressiveMauve` output

read_xmfa <- function(file) {
  lines <- readLines(file)
  cat("Number of read lines:", length(lines), "\n")
  
  blocks <- list()
  block_id <- 0
  current_block <- NULL
  
  for (line in lines) {
    # Skip comment lines
    if (startsWith(line, "#")) next
    
    # Block-deliminter
    if (trimws(line) == "=") {
      if (!is.null(current_block)) {
        blocks[[block_id]] <- current_block
        cat("Added Block", block_id, "with", nrow(current_block), "lines\n")
      }
      current_block <- NULL
      next
    }
    
    # Process header-lines
    if (startsWith(line, ">")) {
      cat("Process line:", line, "\n")
      # Example: >1:100-200 +
      pattern <- "^>\\s+([0-9]+):([0-9]+)-([0-9]+)\\s+([+-])\\s+(.*\\_*[0-9]+)\\.{1}.*"
      match <- regmatches(line, regexec(pattern, line))[[1]]
      
      if (length(match) == 6) {
        genome <- as.integer(match[2])
        start  <- as.integer(match[3])
        end    <- as.integer(match[4])
        strand <- match[5]
        genome_name <- match[6]
        
        if (is.null(current_block)) {
          block_id <- block_id + 1
          current_block <- data.frame(
            Genome = integer(),
            Start = integer(),
            End = integer(),
            Strand = character(),
            Block = integer(),
            Genome_Name = character(),
            stringsAsFactors = FALSE
          )
        }
        
        current_block <- rbind(current_block, data.frame(
          Genome = genome,
          Start = start,
          End = end,
          Strand = strand,
          Block = block_id,
          Genome_Name = genome_name,
          stringsAsFactors = FALSE
        ))
      } else {
        cat("No match for line:", line, "\n")
      }
    }
  }
  
  # Add last block, if available
  if (!is.null(current_block)) {
    blocks[[block_id]] <- current_block
    cat("Added last block", block_id, "with", nrow(current_block), "lines\n")
  }
  
  # Convert all blocks into one dataframe
  df <- do.call(rbind, blocks)
  cat("Final number of rows in the dataframe:", nrow(df), "\n")
  return(df)
}

Some important additional data

Location of the progressiveMauve output (xmfa_file)
Custom genome order, here as vector (genome_order)
An excel file listing all contig boundaries (df)

xmfa_file <- "/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/genomes/reoriented/C_sph_genomes.xmfa"

genome_order <- c("GCF_001576595", "GCF_000012905", "GCF_001685625",
                  "GCF_012647365", "GCF_003324715", "GCF_002706325",
                  "GCF_000273405", "GCF_000269625", "DSM158",
                  "GCF_049434525", "GCF_000212605", 'GCF_052246835',
                  "GCF_000021005", "GCF_003846365", "GCF_003846425",
                  "GCF_000015985", "GCF_003846385", "GCF_003846405")

df <- read_excel("/home/anna/R/Genome_Overview_Rsph.xlsx")

Reading the `xmfa` file

To simplify the visualization only blocks located in two or more assemblies are processed. Therefor we removed all unique blocks (xmfa_data_short).

xmfa_data <- read_xmfa(xmfa_file)

xmfa_data_short <- xmfa_data[(1:1683),]

To plot the different data together, we need to reorder our dataframes (df and xmfa_data_short) after the genome_order vector

df$Genome_Name <- factor(
  df$Genome_Name,
  levels = genome_order
)

xmfa_data_short$Genome_Name <- factor(
  xmfa_data_short$Genome_Name,
  levels = genome_order
)

Plotting the data with ggplot2

ggplot(xmfa_data_short, 
       aes(y = Genome_Name)
  ) +
  geom_rect( #LCB from progressiveMauve
    aes(xmin = Start, xmax = End,
        ymin = as.numeric(Genome_Name) - 0.3,
        ymax = as.numeric(Genome_Name) + 0.3,
        fill = factor(Block))
  ) +
  geom_segment( #Contig boundaries
    data = df,
    aes(
      x = Contig_Boundaries_cum, 
      xend = Contig_Boundaries_cum,
      y = as.numeric(Genome_Name) - 0.4,
      yend = as.numeric(Genome_Name) + 0.4
    ),
    color = "grey20", 
    linetype = "dashed", 
    linewidth = 0.3
  ) +
  scale_y_discrete(limits = genome_order) + # custom genome order
  guides(fill = "none")+ #remove legend
  theme_minimal()

#Finetuning of the figure is made with inkscape

Generating a phylogeny tree using a distance matrix from mash

Loading the required packages

library(ape)        # version 5.7-1


Attaching package: 'ape'

The following object is masked from 'package:dplyr':

    where

The following object is masked from 'package:circlize':

    degree

The following object is masked from 'package:Biostrings':

    complement

library(tidyverse)  # version 2.0.0

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::between()      masks data.table::between()
✖ readr::col_factor()   masks scales::col_factor()
✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
✖ dplyr::combine()      masks BiocGenerics::combine()
✖ purrr::compact()      masks XVector::compact()
✖ dplyr::desc()         masks IRanges::desc()
✖ purrr::discard()      masks scales::discard()
✖ tidyr::expand()       masks S4Vectors::expand()
✖ dplyr::filter()       masks stats::filter()
✖ dplyr::first()        masks data.table::first(), S4Vectors::first()
✖ lubridate::hour()     masks data.table::hour()
✖ lubridate::isoweek()  masks data.table::isoweek()
✖ dplyr::lag()          masks stats::lag()
✖ dplyr::last()         masks data.table::last()
✖ lubridate::mday()     masks data.table::mday()
✖ lubridate::minute()   masks data.table::minute()
✖ lubridate::month()    masks data.table::month()
✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
✖ lubridate::quarter()  masks data.table::quarter()
✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()       masks S4Vectors::rename()
✖ lubridate::second()   masks data.table::second(), S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
✖ purrr::transpose()    masks data.table::transpose()
✖ lubridate::wday()     masks data.table::wday()
✖ lubridate::week()     masks data.table::week()
✖ ape::where()          masks dplyr::where()
✖ lubridate::yday()     masks data.table::yday()
✖ lubridate::year()     masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(reshape2)   # version 1.4.4


Attaching package: 'reshape2'

The following object is masked from 'package:tidyr':

    smiths

The following objects are masked from 'package:data.table':

    dcast, melt

library(ggplot2)    # version 4.0.0
library(ggtree)     # version 3.99.0

ggtree v3.99.0 Learn more at https://yulab-smu.top/contribution-tree-data/

Please cite:

Shuangbin Xu, Lin Li, Xiao Luo, Meijun Chen, Wenli Tang, Li Zhan, Zehan
Dai, Tommy T. Lam, Yi Guan, Guangchuang Yu. Ggtree: A serialized data
object for visualization of a phylogenetic tree and annotation data.
iMeta 2022, 1(4):e56. doi:10.1002/imt2.56

Attaching package: 'ggtree'

The following object is masked from 'package:tidyr':

    expand

The following object is masked from 'package:ape':

    rotate

The following object is masked from 'package:Biostrings':

    collapse

The following object is masked from 'package:IRanges':

    collapse

The following object is masked from 'package:S4Vectors':

    expand

Loading the output from mash:

It should be a comma-separated file with the following columns:

Column 1: Name of the first genome
Column 2: Name of the second genome
Column 3: Distance between the genomes

Then we calculated the distance matrix

# Mash output: col1=genome1, col2=genome2, col3=distance
dist_tab <- read.table("/home/Drives/HDD03_06T_SDE/anna/SyntenyPlotDSMSubH2/mash/dist.tab", 
                       header=FALSE, 
                       stringsAsFactors = FALSE)

# Calculate distance matrix
dist_mat <- reshape2::acast(dist_tab, V1~V2, value.var="V3", fill=0)
dist_mat <- as.dist(dist_mat)

We used the distance matrix to calculate the neighbor-joining tree and plotted this tree using ggtree

# Neighbor-Joining tree calculation
tree <- nj(dist_mat)

#set global option to ignore negative edges
options(ignore.negative.edge=TRUE)
#plotting the tree using ggtree
 ggtree(tree) + 
  geom_tiplab(align = T, as_ylab = T) +
  geom_hilight(node = c(3,4), color = "lightgreen")

Scale for y is already present.
Adding another scale for y, which will replace the existing scale.

Circos plot of the assemblies using circlize

Loading the required packages

Import the genomes and the genome annotations

Tidying the sequence names (remove file extension) and rename the chromosomes

Create a sliding window (for the calculation of gene densities and GC content) of size 10,000 bp

Calculate the gene densities

Calculate the GC content

Data for the phage sequences (PHASTER web-server)

Read the coordinates from a MUMMer coordinates file (for the synteny track)

Color palette for the links (npg palette from the ggsci package) and for the other tracks

Prepare the new chromosome names

Circos plot

Synteny and phylogenetic tree of 18 complete R. sphaeroides genomes

Visualization of progressiveMauve Output

Loading required packages:

Function for parsing the progressiveMauve output

Some important additional data

Reading the xmfa file

Plotting the data with ggplot2

Generating a phylogeny tree using a distance matrix from mash

Loading the required packages

Loading the output from mash:

Then we calculated the distance matrix

Color palette for the links (npg palette from the `ggsci` package) and for the other tracks

Function for parsing the `progressiveMauve` output

Reading the `xmfa` file