In this exercise, you will simulate a population and analyse its genetic variation in three steps and one extra step:
For each step, we have given you instructions with an
AlphaSimR
template code to complete (replace ??? with an
appropriate code).
This exercise deliberately goes beyond the material we have covered up to now. To help you on this pathway, we indicate which functions should be used and we point to their documentation. Embracing this growth mindset is important for mastering AlphaSimR and combining it with other R functionality.
# Clean the working environment
rm(list = ls())
# Set the default plot layout
par(mfrow = c(1, 1))
library(AlphaSimR)
## Loading required package: R6
# Use the following parameters:
# * 20 individuals
# * 3 chromosomes
# * 100 segregating loci (sites) per chromosome
# * MAIZE species
# Note that maize has 10 chromosome pairs, but to speed up this session
# we will only simulate 3 chromosome pairs.
# If you get errors or get lost, remember to use help(runMacs)
founderGenomes = runMacs(nInd = 20,
nChr = 3,
segSites = 100,
species = "MAIZE")
# Set the global simulation parameters object SP from founding genomes
SP = SimParam$new(founderGenomes)
# Skip sex allocation in this example because maize individuals have "male" and
# "female" reproductive organs).
# Now create a population of individuals from founding genomes
maizePop = newPop(founderGenomes)
# Access haplotypes of maize plants in the maizePop object
maizeHaplo = pullSegSiteHaplo(maizePop)
# Check the first 10 haplotypes at the first 10 loci
maizeHaplo[1:10, 1:10]
## 1_1 1_2 1_3 1_4 1_5 1_6 1_7 1_8 1_9 1_10
## 1_1 0 0 1 0 1 0 0 1 0 0
## 1_2 0 0 0 0 0 0 1 1 1 1
## 2_1 0 0 1 0 1 0 0 1 0 0
## 2_2 0 0 0 0 0 0 1 1 1 1
## 3_1 0 0 0 0 0 0 1 1 1 1
## 3_2 0 0 0 0 0 0 0 1 0 0
## 4_1 0 0 0 0 0 0 1 0 1 0
## 4_2 0 0 0 0 1 0 0 1 0 0
## 5_1 1 1 0 0 0 1 0 1 0 0
## 5_2 0 0 0 0 0 1 0 1 0 0
# Check the first 5 genotypes at the first 10 loci
maizeGeno = pullSegSiteGeno(maizePop)
maizeGeno[1:5, 1:10]
## 1_1 1_2 1_3 1_4 1_5 1_6 1_7 1_8 1_9 1_10
## 1 0 0 1 0 1 0 1 2 1 1
## 2 0 0 1 0 1 0 1 2 1 1
## 3 0 0 0 0 0 0 1 2 1 1
## 4 0 0 0 0 1 0 1 1 1 0
## 5 1 1 0 0 0 2 0 2 0 0
To demonstrate the joint use of AlphaSimR and base R functionality, we will now analyse the simulated genomes by calculating two common statistics: allele frequency and correlation between allele dosages at different loci. We have not covered these topics previously, so, this is a guided exercise;
Allele frequency describes the proportion of mutations at a locus.
# Calculate frequency of mutant alleles in the haplotypes at each locus using the
# colMeans() function
alleleFreq = colMeans(maizeHaplo)
# Summarise allele frequencies
print(alleleFreq) # alleleFreq is a vector of frequencies
## 1_1 1_2 1_3 1_4 1_5 1_6 1_7 1_8 1_9 1_10 1_11 1_12 1_13
## 0.100 0.025 0.275 0.050 0.375 0.225 0.175 0.875 0.350 0.200 0.225 0.500 0.075
## 1_14 1_15 1_16 1_17 1_18 1_19 1_20 1_21 1_22 1_23 1_24 1_25 1_26
## 0.050 0.525 0.050 0.550 0.425 0.525 0.475 0.225 0.175 0.200 0.050 0.550 0.775
## 1_27 1_28 1_29 1_30 1_31 1_32 1_33 1_34 1_35 1_36 1_37 1_38 1_39
## 0.300 0.175 0.650 0.875 0.225 0.350 0.375 0.150 0.150 0.350 0.125 0.525 0.275
## 1_40 1_41 1_42 1_43 1_44 1_45 1_46 1_47 1_48 1_49 1_50 1_51 1_52
## 0.625 0.900 0.500 0.425 0.350 0.175 0.925 0.225 0.250 0.050 0.075 0.075 0.325
## 1_53 1_54 1_55 1_56 1_57 1_58 1_59 1_60 1_61 1_62 1_63 1_64 1_65
## 0.800 0.175 0.075 0.725 0.725 0.575 0.350 0.025 0.225 0.650 0.575 0.225 0.575
## 1_66 1_67 1_68 1_69 1_70 1_71 1_72 1_73 1_74 1_75 1_76 1_77 1_78
## 0.800 0.325 0.500 0.525 0.325 0.075 0.575 0.225 0.125 0.525 0.125 0.150 0.050
## 1_79 1_80 1_81 1_82 1_83 1_84 1_85 1_86 1_87 1_88 1_89 1_90 1_91
## 0.100 0.700 0.700 0.025 0.075 0.175 0.150 0.150 0.900 0.075 0.450 0.525 0.725
## 1_92 1_93 1_94 1_95 1_96 1_97 1_98 1_99 1_100 2_1 2_2 2_3 2_4
## 0.475 0.150 0.350 0.125 0.625 0.025 0.450 0.750 0.650 0.150 0.375 0.100 0.900
## 2_5 2_6 2_7 2_8 2_9 2_10 2_11 2_12 2_13 2_14 2_15 2_16 2_17
## 0.475 0.950 0.250 0.425 0.100 0.025 0.550 0.100 0.475 0.125 0.375 0.125 0.750
## 2_18 2_19 2_20 2_21 2_22 2_23 2_24 2_25 2_26 2_27 2_28 2_29 2_30
## 0.450 0.850 0.200 0.550 0.200 0.400 0.225 0.950 0.700 0.625 0.075 0.275 0.725
## 2_31 2_32 2_33 2_34 2_35 2_36 2_37 2_38 2_39 2_40 2_41 2_42 2_43
## 0.375 0.100 0.550 0.125 0.400 0.250 0.275 0.750 0.025 0.075 0.900 0.025 0.050
## 2_44 2_45 2_46 2_47 2_48 2_49 2_50 2_51 2_52 2_53 2_54 2_55 2_56
## 0.200 0.575 0.525 0.700 0.050 0.425 0.400 0.275 0.175 0.925 0.525 0.375 0.350
## 2_57 2_58 2_59 2_60 2_61 2_62 2_63 2_64 2_65 2_66 2_67 2_68 2_69
## 0.125 0.950 0.825 0.075 0.075 0.675 0.575 0.225 0.750 0.075 0.925 0.100 0.325
## 2_70 2_71 2_72 2_73 2_74 2_75 2_76 2_77 2_78 2_79 2_80 2_81 2_82
## 0.650 0.100 0.375 0.500 0.325 0.150 0.625 0.975 0.350 0.400 0.075 0.800 0.525
## 2_83 2_84 2_85 2_86 2_87 2_88 2_89 2_90 2_91 2_92 2_93 2_94 2_95
## 0.200 0.700 0.275 0.325 0.150 0.550 0.325 0.500 0.050 0.575 0.600 0.025 0.100
## 2_96 2_97 2_98 2_99 2_100 3_1 3_2 3_3 3_4 3_5 3_6 3_7 3_8
## 0.400 0.025 0.250 0.875 0.075 0.550 0.900 0.675 0.525 0.225 0.125 0.325 0.275
## 3_9 3_10 3_11 3_12 3_13 3_14 3_15 3_16 3_17 3_18 3_19 3_20 3_21
## 0.125 0.400 0.175 0.575 0.575 0.150 0.175 0.550 0.475 0.425 0.875 0.625 0.525
## 3_22 3_23 3_24 3_25 3_26 3_27 3_28 3_29 3_30 3_31 3_32 3_33 3_34
## 0.200 0.525 0.050 0.850 0.325 0.125 0.075 0.425 0.275 0.225 0.200 0.525 0.125
## 3_35 3_36 3_37 3_38 3_39 3_40 3_41 3_42 3_43 3_44 3_45 3_46 3_47
## 0.375 0.375 0.975 0.525 0.525 0.200 0.425 0.250 0.075 0.800 0.500 0.075 0.025
## 3_48 3_49 3_50 3_51 3_52 3_53 3_54 3_55 3_56 3_57 3_58 3_59 3_60
## 0.500 0.925 0.275 0.275 0.525 0.900 0.150 0.175 0.025 0.400 0.850 0.750 0.025
## 3_61 3_62 3_63 3_64 3_65 3_66 3_67 3_68 3_69 3_70 3_71 3_72 3_73
## 0.300 0.450 0.275 0.175 0.425 0.325 0.375 0.775 0.250 0.450 0.950 0.900 0.700
## 3_74 3_75 3_76 3_77 3_78 3_79 3_80 3_81 3_82 3_83 3_84 3_85 3_86
## 0.075 0.350 0.650 0.925 0.125 0.100 0.550 0.375 0.925 0.100 0.525 0.900 0.625
## 3_87 3_88 3_89 3_90 3_91 3_92 3_93 3_94 3_95 3_96 3_97 3_98 3_99
## 0.200 0.525 0.075 0.275 0.050 0.075 0.875 0.525 0.050 0.025 0.175 0.600 0.525
## 3_100
## 0.275
hist(alleleFreq, xlab = "Allele frequency", main = "")
# We usually see more loci with lower allele frequencies indicating younger
# mutations or mutations that were selected against.
Correlation between allele dosages at different loci describes the extent of linkage between the loci. We expect higher absolute correlation (away from 0) between nearby loci because they are physically linked and there is a small probability of recombination between nearby loci. As the physical distance between loci increases, probability of recombination increases, which in turn reduces the correlations between allele dosages at distant loci (more on recombination in other sessions).
# Calculate correlations between allele dosages at the haplotype loci
lociCor = cor(maizeHaplo)
# Inspect some correlations
str(lociCor) # lociCor is a matrix of correlations
## num [1:300, 1:300] 1 0.4804 -0.0187 -0.0765 -0.2582 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:300] "1_1" "1_2" "1_3" "1_4" ...
## ..$ : chr [1:300] "1_1" "1_2" "1_3" "1_4" ...
lociCor[1:5, 1:5]
## 1_1 1_2 1_3 1_4 1_5
## 1_1 1.00000000 0.48038446 -0.01866308 -0.07647191 -0.2581989
## 1_2 0.48038446 1.00000000 -0.09862001 -0.03673592 -0.1240347
## 1_3 -0.01866308 -0.09862001 1.00000000 -0.14129297 0.6794490
## 1_4 -0.07647191 -0.03673592 -0.14129297 1.00000000 -0.1777047
## 1_5 -0.25819889 -0.12403473 0.67944904 -0.17770466 1.0000000
# Summarise the correlations
hist(lociCor, xlab = "Correlation", main = "")
# Correlations are distributed around zero because most pairs of loci are far
# apart, but there are some pairs of loci with larger correlations
# Plot correlations between the first 100 loci (these span chromosome 1)
lociCorChr1 = lociCor[1:100, 1:100]
image(lociCorChr1,
xlab = "Relative locus position", ylab = "Relative locus position")
# Focusing on just one chromosome we can see elevated correlations (darker colour)
# around the diagonal of the matrix - this is correlation between nearby loci
# EXTRA: Plot the correlations between loci as a function of genetic distance
# between the loci (no need to change any of the code below - just run it!).
lociCorChr1Val = c(lociCorChr1[lower.tri(lociCorChr1)])
lociCorChr1Dis = outer(SP$genMap[[1]], SP$genMap[[1]], FUN = "-")
lociCorChr1Dis = lociCorChr1Dis[lower.tri(lociCorChr1Dis)]
plot(x = lociCorChr1Dis, y = abs(lociCorChr1Val),
xlab = "Distance between loci", ylab = "Correlation")
lines(lowess(x = lociCorChr1Dis, y = abs(lociCorChr1Val)), col = "red")
# We plot absolute value of correlations to focus only on the magnitude of values.
# Black points show correlations between loci, while the red line shows smoothed
# average correlation. There are too many black points though, so we can't see
# the data clearly.
# EXTRA: A clearer way to present such data is to use smoothScatter()
smoothScatter(x = lociCorChr1Dis, y = abs(lociCorChr1Val),
xlab = "Distance between loci", ylab = "Correlation")
lines(lowess(x = lociCorChr1Dis, y = abs(lociCorChr1Val)), col = "red")
# Blue colour represents smoothed density of our data points - the darker the
# colour the more data points there are in one area. We use this plotting method
# when we have many data points that overlay each other. See help(smoothScatter)
# for more details.
#
# Have you expected higher correlation between nearby loci?
# You can repeat the simulation with a larger number of segregating loci, say
# at least 1000 per chromosome, to capture more nearby loci. Also, you should
# revisit this analysis with a population under selection.