wget https://www.encodeproject.org/files/ENCFF001NQP/@@download/ENCFF001NQP.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001NQQ/@@download/ENCFF001NQQ.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001NCH/@@download/ENCFF001NCH.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001NCF/@@download/ENCFF001NCF.fastq.gz
The files were renamed:
mv ENCFF001NQP.fastq.gz rep1.fastq.gz
mv ENCFF001NQQ.fastq.gz rep2.fastq.gz
mv ENCFF001NCH.fastq.gz input1.fastq.gz
mv ENCFF001NCF.fastq.gz input2.fastq.gz
fastq.gz files were downloaded into bioconductor and subsampled using the FastqSampler() method from the ShortRead package
fqSample <- FastqSampler("/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/raw_data/rep1.fastq.gz", n = 10^6)
fastq <- yield(fqSample)
fastq
class: ShortReadQ
length: 1000000 reads; width: 36 cycles
36 cycles means that the read length is 36 nt.
readSequences <- sread(fastq)
readQuality <- quality(fastq)
readIDs <- id(fastq)
readSequences
DNAStringSet object of length 1000000:
width seq
[1] 36 ACAGTAGTCTTCCGGTCAAGAGCTACACGGGTTGTG
[2] 36 TGTGTGTATGTCTGCCTGCCAGCAGAGGGAACCAGA
[3] 36 CTGATTCAAACTCTGTTCTGTCAGCTTCTGACTGAA
[4] 36 AGATACCATATAATGAACCCACGAAAACAATCCGGT
[5] 36 CTAAAATAACCAAAGGACTTGTCACATCTATGTAAA
... ... ...
[999996] 36 GTTTATAAACAGCATTGTACGTTACCCAGGACCTGC
[999997] 36 AAACAGGAGGAAGGGAGGATTGAGGGACCGCACAGA
[999998] 36 CAACAGGTTTGATCTTCACGCTCGGCTTATTCTTGC
[999999] 36 GTGACTTTACCCGCTTTATCTCTGGACTCCAGAGTT
[1000000] 36 TTTCCTACTTTACAGCATAGATGATGAAAAACACAA
library(ggplot2)
library(repr)
options(repr.plot.width=15, repr.plot.height=8)
toPlot <- data.frame(ReadQ = readQualities)
ggplot(toPlot, aes(x = ReadQ)) + geom_histogram(bins=30) + theme_minimal()+
theme(axis.text.x = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 20, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
plot.title = element_text(size=22, hjust = .5, vjust = .5, face="bold"))+
ylim(0, 220000)+ggtitle("Per sequence quality score")
readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
readSequences_AlpbyCycle[1:4, 1:10]
A matrix: 4 × 10 of type int
A 306954 328614 282305 276432 276920 276570 278020 281716 271557 278703
C 178805 174129 206175 224071 210374 220172 213623 217461 220140 222889
G 243413 246240 228349 211866 212787 214306 210685 210163 213540 209952
T 266665 247611 278246 278121 290270 279718 288472 279555 283949 277603
AFreq <- readSequences_AlpbyCycle["A", ]
CFreq <- readSequences_AlpbyCycle["C", ]
GFreq <- readSequences_AlpbyCycle["G", ]
TFreq <- readSequences_AlpbyCycle["T", ]
toPlot <- data.frame(Count = c(AFreq, CFreq, GFreq, TFreq), Cycle = rep(1:36, 4),
Base = rep(c("A", "C", "G", "T"), each = 36))
ggplot(toPlot, aes(y = Count, x = Cycle, colour = Base)) + geom_line() + theme_bw()+
theme(axis.text.x = element_text(color = "black", size = 20, angle =0, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 20, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
plot.title = element_text(size=22, hjust = .5, vjust = .5, face="bold"))+
theme(
legend.text = element_text(size = 16),
legend.title = element_text(size = 18))+ggtitle("Per base sequence content")
options(repr.plot.width=14, repr.plot.height=10)
graphics::boxplot(qualAsMatrix[1:1000, ], xlab="position", ylab="quality score", main="Per base quality score", cex.axis=1.5, cex.lab=1.5, cex.main=1.75)
files <- list.files(path="/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data",pattern="*.fastq.gz")
files_filt<- paste("filt", files, sep = "_")
path_filt<-paste0("/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/", files_filt)
path_filt
'/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/filt_input1.fastq.gz'
'/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/filt_input2.fastq.gz'
'/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/filt_rep1.fastq.gz'
'/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/filt_rep2.fastq.gz'
Next, list of generators were created (FastqStreamer objects are generators)
mylist <- lapply(paste0("/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/",files), function(x) FastqStreamer(x, n = 1e+05))
mylist
[[1]]
class: FastqStreamer
file: input1.fastq.gz
status: n=100000 current=0 added=0 total=0 buffer=0
[[2]]
class: FastqStreamer
file: input2.fastq.gz
status: n=100000 current=0 added=0 total=0 buffer=0
[[3]]
class: FastqStreamer
file: rep1.fastq.gz
status: n=100000 current=0 added=0 total=0 buffer=0
[[4]]
class: FastqStreamer
file: rep2.fastq.gz
status: n=100000 current=0 added=0 total=0 buffer=0
Filtering
k=1
for (i in mylist){
TotalReads <- 0
TotalReadsFilt <- 0
while (length(fq <- yield(i)) > 0) {
TotalReads <- TotalReads + length(fq)
filt1 <- fq[alphabetScore(fq) > 300]
filt2 <- filt1[alphabetFrequency(sread(filt1))[, "N"] < 10]
TotalReadsFilt <- TotalReadsFilt + length(filt2)
writeFastq(filt2, path_filt[k], mode = "a")
k=k+1
}
}
index=/Users/alexeyefanov/Desktop/Programming/Bioconductor/refgenomes/mouse_index
filt=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/filtered
reference=/Users/alexeyefanov/Desktop/Programming/Bioconductor/refgenomes/BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa
sam=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam
bowtie2-build -f $reference $index/index
gunzip -k $filt/*.gz
cd $sam
for sample in $filt/*.fastq; do
base=$(basename $sample .fastq)
bowtie2 -x $index/index -fr -q $sample -p 12 -S $base.sam
done
for sample in $sam/*.bam;do
base=$(basename $sample .bam)
samtools view -S -b -o $base.bam $base.sam
samtools sort -o ${base}_sorted.bam ${base}.bam
samtools index ${base}_sorted.bam
done
input1=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_input1_sorted.bam
input2=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_input2_sorted.bam
rep1=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_rep1_sorted.bam
rep2=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_rep2_sorted.bam
peak=/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/peaks
#conda activate MACS
macs2 callpeak -t $rep1 $rep2 -c $input1 $input2 --name mel --outdir $peak
library(repr)
options(repr.plot.width=9, repr.plot.height=15)
mappedReads <- idxstatsBam("/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_rep1_sorted.bam")
TotalMapped <- sum(mappedReads[, "mapped"])
ggplot(mappedReads, aes(x = seqnames, y = mapped)) + geom_bar(stat = "identity") +
coord_flip()+
theme(axis.text.x = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 15, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 15, angle = 90, hjust = .5, vjust = .5, face = "plain"))
BigWig <- coverage("/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_rep1_sorted.bam", weight = (10^6)/TotalMapped)
library(rtracklayer)
export.bw(BigWig, con = "/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/sam/filt_rep1_weighted1.bw")
macsPeaks <- "/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/peaks/mel_peaks.xls"
macsPeaks_DF <- read.delim(macsPeaks, comment.char = "#")
macsPeaks_DF[1:2, ]
A data.frame: 2 × 10
chr start end length abs_summit pileup X.log10.pvalue. fold_enrichment X.log10.qvalue. name
1 chr1 4785344 4785933 590 4785600 29.56 14.49350 5.46755 11.17909 mel_peak_1
2 chr1 5083013 5083207 195 5083083 35.06 11.00655 3.60638 7.87681 mel_peak_2
library(GenomicRanges)
macsPeaks_GR <- GRanges(seqnames = macsPeaks_DF[, "chr"], IRanges(macsPeaks_DF[,
"start"], macsPeaks_DF[, "end"]))
macsPeaks_GR
GRanges object with 26241 ranges and 0 metadata columns:
seqnames ranges strand
[1] chr1 4785344-4785933 *
[2] chr1 5083013-5083207 *
[3] chr1 5677542-5677817 *
[4] chr1 6012559-6012715 *
[5] chr1 7397544-7398143 *
... ... ... ...
[26237] chrY 58550402-58550723 *
[26238] chrY 60505268-60505555 *
[26239] chrY 89932367-89932501 *
[26240] chrY 90818621-90818899 *
[26241] chrY 90824605-90824835 *
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
table(seqnames(macsPeaks_GR))
chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr3
1506 1332 2635 1018 1007 790 1167 852 1304 700 961 1924 1357
chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
1567 1840 1261 1602 1197 1489 716 12
Mice have 20 chromosomes in their haploid genome.
mcols(macsPeaks_GR) <- macsPeaks_DF[, c("abs_summit", "fold_enrichment")]
macsPeaks_GR
toBlkList <- "/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/blacklist/ENCFF547METmm10.bed"
blkList <- import.bed(toBlkList)
macsPeaks_GR <- macsPeaks_GR[!macsPeaks_GR %over% blkList]
macsPeaks_GR
GRanges object with 26237 ranges and 2 metadata columns:
seqnames ranges strand | abs_summit fold_enrichment
|
[1] chr1 4785344-4785933 * | 4785600 5.46755
[2] chr1 5083013-5083207 * | 5083083 3.60638
[3] chr1 5677542-5677817 * | 5677571 5.65644
[4] chr1 6012559-6012715 * | 6012601 5.69169
[5] chr1 7397544-7398143 * | 7397789 8.75376
... ... ... ... . ... ...
[26233] chrY 58550402-58550723 * | 58550543 17.33837
[26234] chrY 60505268-60505555 * | 60505432 8.35447
[26235] chrY 89932367-89932501 * | 89932468 5.10398
[26236] chrY 90818621-90818899 * | 90818649 3.21887
[26237] chrY 90824605-90824835 * | 90824624 2.55565
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
peakAnno <- annotatePeak(macsPeaks_GR, tssRegion = c(-500, 500), TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene,
annoDb = "org.Mm.eg.db")
peakAnno
Annotated peaks generated by ChIPseeker
26237/26237 peaks were annotated
Genomic Annotation Summary:
Feature Frequency
9 Promoter 28.9514807
4 5' UTR 0.3163471
3 3' UTR 1.9781225
1 1st Exon 2.0048024
7 Other Exon 2.5803255
2 1st Intron 15.5772383
8 Other Intron 20.1433091
6 Downstream (<=300) 0.2210619
5 Distal Intergenic 28.2273126
peakAnno_GR <- as.GRanges(peakAnno)
peakAnno_DF <- as.data.frame(peakAnno)
options(repr.plot.width=15, repr.plot.height=6)
plotAnnoBar(peakAnno)+
theme(axis.text.x = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 20, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
plot.title = element_text(size=20, hjust = .5, vjust = .5, face="bold"))
plotDistToTSS(peakAnno)+theme(axis.text.x = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 20, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
plot.title = element_text(size=20, hjust = .5, vjust = .5, face="bold"))
upsetplot(peakAnno, vennpie = F)+theme(axis.text.x = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
axis.text.y = element_text(color = "black", size = 20, angle = 0, hjust = 1, vjust = 0, face = "plain"),
axis.title.x = element_text(color = "black", size = 20, angle = 0, hjust = .5, vjust = 0, face = "plain"),
axis.title.y = element_text(color = "black", size = 20, angle = 90, hjust = .5, vjust = .5, face = "plain"),
plot.title = element_text(size=20, hjust = .5, vjust = .5, face="bold"))+ggtitle("Upset plot of peak annotations")
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(ChIPseeker)
peakAnno <- annotatePeak(macsPeaks_GR, tssRegion=c(-1000, 1000),
TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene,
annoDb="org.Mm.eg.db")
macsSummits_GR <- GRanges(seqnames(macsPeaks_GR), IRanges(macsPeaks_GR$abs_summit,
macsPeaks_GR$abs_summit), score = macsPeaks_GR$fold_enrichment)
macsSummits_GR <- resize(macsSummits_GR, 100, fix = "center")
macsSummits_GR
GRanges object with 26237 ranges and 1 metadata column:
seqnames ranges strand | score
|
[1] chr1 4785550-4785649 * | 5.46755
[2] chr1 5083033-5083132 * | 3.60638
[3] chr1 5677521-5677620 * | 5.65644
[4] chr1 6012551-6012650 * | 5.69169
[5] chr1 7397739-7397838 * | 8.75376
... ... ... ... . ...
[26233] chrY 58550493-58550592 * | 17.33837
[26234] chrY 60505382-60505481 * | 8.35447
[26235] chrY 89932418-89932517 * | 5.10398
[26236] chrY 90818599-90818698 * | 3.21887
[26237] chrY 90824574-90824673 * | 2.55565
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10, macsSummits_GR)
names(peaksSequences) <- paste0(seqnames(macsSummits_GR), ":", start(macsSummits_GR),
"-", end(macsSummits_GR))
peaksSequences[1:2, ]
DNAStringSet object of length 2:
width seq names
[1] 100 CCGCACCGCCGCCAAGCGTTTAC...GGACCGCAGCAGGTCCAGGCTGG chr1:4785550-4785649
[2] 100 TTTGGTGCGCGGTCCAGCAGTTT...CAGACGCTGCTGTCACTTCGTAG chr1:5083033-5083132
writeXStringSet(peaksSequences, file ="/Users/alexeyefanov/Desktop/Programming/Bioconductor/ChiP-seq/MEL/data/meme.fa").
MEME-Chip was run into zsh shell:
meme-chip -maxw 10 -minw 5 -o results meme.fa
The top result of MEME-Chip output contains consensus myc binding site: CACGTG/CATGTG
Back to bioinformatics project page: Back to bioinformatics project page
Back to home: Back to home