cd /data/dnaseq/
mkdir reads aligned_reads results supporting_files script
raw reads will be downloaded to the reads folderSetting variables:
ref=/data/dnaseq/supporting_files/hg38/hg38.fa
wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_1.filt.fastq.gz
wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_2.filt.fastq.gz
Reference genome downloading:
wget -P /data/dnaseq/supporting_files/hg38/ https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
gunzip /data/dnaseq/tutorial/supporting_files/hg38/
samtools faidx /data/dnaseq/supporting_files/hg38/hg38.fa
Checking read's quality:
for sample in $reads/*.fastq.gz; do
fastqc $sample -o ${reads}/
done
Reference indexing
bwa index ${ref}
Align reads to reference genome
bwa mem -t 32 -R "@RG\tID:SRR062634\tPL:ILLUMINA\tSM:SRR062634" ${ref} ${reads}/SRR062634_1.filt.fastq.gz ${reads}/SRR062634_2.filt.fastq.gz > ${aligned_reads}/SRR062634.paired.sam
samtools view -Shb -o ${aligned_reads}/SRR062634.paired.bam ${aligned_reads}/SRR062634.paired.sam
Mark Duplicates and Sort
java -jar $PICARD SortSam \
CREATE_INDEX=true \
INPUT=${aligned_reads}/SRR062634.paired.bam \
OUTPUT=${aligned_reads}/SRR062634_sorted_dedup_reads.bam \
SORT_ORDER=coordinate \
VALIDATION_STRINGENCY=STRICT
java -jar $PICARD MarkDuplicates \
CREATE_INDEX=true \
INPUT=${aligned_reads}/SRR062634_sorted_dedup_reads.bam \
VALIDATION_STRINGENCY=STRICT
gatk BaseRecalibrator -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --known-sites ${known_sites} -O ${data}/recal_data.table
2. Apply the model to adjust the base quality scores
gatk ApplyBQSR -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --bqsr-recal-file {$data}/recal_data.table -O ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam
gatk CollectAlignmentSummaryMetrics R=${ref} I=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam O=${aligned_reads}/alignment_metrics.txt
gatk CollectInsertSizeMetrics INPUT=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam OUTPUT=${aligned_reads}/insert_size_metrics.txt HISTOGRAM_FILE=${aligned_reads}/insert_size_histogram.pdf
gatk HaplotypeCaller -R ${ref} -I ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam -O ${results}/raw_variants.vcf
Extract SNPs & INDELS
gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type SNP -O ${results}/raw_snps.vcf
gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type INDEL -O ${results}/raw_indels.vcf
Filter SNPs
gatk VariantFiltration \
-R ${ref} \
-V ${results}/raw_snps.vcf \
-O ${results}/filtered_snps.vcf \
-filter-name "QD_filter" -filter "QD < 2.0" \
-filter-name "FS_filter" -filter "FS > 60.0" \
-filter-name "MQ_filter" -filter "MQ < 40.0" \
-filter-name "SOR_filter" -filter "SOR > 4.0" \
-filter-name "MQRankSum_filter" -filter "MQRankSum < -12.5" \
-filter-name "ReadPosRankSum_filter" -filter "ReadPosRankSum < -8.0" \
-genotype-filter-expression "DP < 10" \
-genotype-filter-name "DP_filter" \
-genotype-filter-expression "GQ < 10" \
-genotype-filter-name "GQ_filter"
Filter INDELs
gatk VariantFiltration \
-R ${ref} \
-V ${results}/raw_indels.vcf \
-O ${results}/filtered_indels.vcf \
-filter-name "QD_filter" -filter "QD < 2.0" \
-filter-name "FS_filter" -filter "FS > 200.0" \
-filter-name "SOR_filter" -filter "SOR > 10.0" \
-genotype-filter-expression "DP < 10" \
-genotype-filter-name "DP_filter" \
-genotype-filter-expression "GQ < 10" \
-genotype-filter-name "GQ_filter"
Select Variants using the INFO field
gatk SelectVariants \
--exclude-filtered \
-V ${results}/filtered_snps.vcf \
-O ${results}/analysis-ready-snps.vcf
gatk SelectVariants \
--exclude-filtered \
-V ${results}/filtered_indels.vcf \
-O ${results}/analysis-ready-indels.vcf
The SelectVariants method can select variants using only the INFO field, not FORMAT field (genotype filter).
To select variants that passed genotype filters the following code is used:
cd $results
cat analysis-ready-snps.vcf|grep -v -E "DP_filter|GQ_filter" > analysis-ready-snps-filteredGT.vcf
cat analysis-ready-indels.vcf| grep -v -E "DP_filter|GQ_filter" > analysis-ready-indels-filteredGT.vcf
Annotation of SNPs and INDELs using Fancotator
gatk Funcotator \
--variant ${results}/analysis-ready-snps-filteredGT.vcf \
--reference ${ref} \
--ref-version hg38 \
--data-sources-path /data/dnaseq/tutorial/datasources/funcotator_dataSources.v1.7.20200521g \
--output ${results}/analysis-ready-snps-filteredGT-functotated.vcf \
--output-file-format VCF
gatk Funcotator \
--variant ${results}/analysis-ready-indels-filteredGT.vcf \
--reference ${ref} \
--ref-version hg38 \
--data-sources-path /data/dnaseq/tutorial/datasources/funcotator_dataSources.v1.7.20200521g \
--output ${results}/analysis-ready-indels-filteredGT-functotated.vcf \
--output-file-format VCF
Extract fields from a VCF file to a tab-delimited table
gatk VariantsToTable \
-V ${results}/analysis-ready-snps-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
-O ${results}/output_snps.table
gatk VariantsToTable \
-V ${results}/analysis-ready-indels-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
-O ${results}/output_indels.table
Extract the FUNCOTATION column from output_snps.table.
The FUNCOTATION column contains 53 annotation fields. The name of that fields can be extracted and
cat analysis-ready-snps-filteredGT-functotated.vcf | grep " Funcotation fields are: " | sed 's/##INFO=<&ID=FUNCOTATION,Number=A,Type=String,Description="Functional annotation from the Funcotator tool. Funcotation fields are: /''/g' | sed 's/|/,/g'>output_curated_variants.csv
Checking the number of fields in the FUNCOTATION column
cat output_snps.table | cut -f 5 | sed 's/|/,/g' | awk -F ',' 'NR>1{print $0}'| awk -F ',' '{print NF}'|sort| uniq -c
107 106
7304 53
There are 107 lines with 106 fields (columns) and 7304 lines with 53 columns.
Some of the FUNCOTATION column rows comtain duplicated amount of fields-106.
That will prevent the whole table from downloading to R or Python. To avoid it, first the output_curated_variants.csv file
cat output_snps.table | cut -f 5 | sed 's/|/,/g' | awk -F ',' 'NR>1{print $0}'| awk -F ',' 'NF==53{print}' >>output_curated_variants.csv
Next, duplicated fields will be removed from rows having 106 fields and amended rows will be added to
the output_curated_variants.csv file.
cat output_snps.table | cut -f 5 | sed 's/|/,/g' | awk -F ',' 'NR>1{print $0}'| awk -F ',' 'NF==106{print}' | cut -d ',' -f1-53| >>output_curated_variants.csv
Downloading the output_curated_variants.csv file into Pandas.
df=pd.read_csv("/Users/alexeyefanov/Desktop/Programming/DNA-seq/output_curated_variants.csv")
df.info()
RangeIndex: 7411 entries, 0 to 7410
Data columns (total 53 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Gencode_34_hugoSymbol 7411 non-null object
1 Gencode_34_ncbiBuild 7411 non-null object
2 Gencode_34_chromosome 7411 non-null object
3 Gencode_34_start 7411 non-null int64
4 Gencode_34_end 7411 non-null int64
5 Gencode_34_variantClassification 7411 non-null object
6 Gencode_34_secondaryVariantClassification 1 non-null object
7 Gencode_34_variantType 7359 non-null object
8 Gencode_34_refAllele 7411 non-null object
9 Gencode_34_tumorSeqAllele1 7411 non-null object
10 Gencode_34_tumorSeqAllele2 7411 non-null object
11 Gencode_34_genomeChange 7359 non-null object
12 Gencode_34_annotationTranscript 7411 non-null object
13 Gencode_34_transcriptStrand 1111 non-null object
14 Gencode_34_transcriptExon 11 non-null float64
15 Gencode_34_transcriptPos 9 non-null float64
16 Gencode_34_cDnaChange 900 non-null object
17 Gencode_34_codonChange 9 non-null object
18 Gencode_34_proteinChange 9 non-null object
19 Gencode_34_gcContent 7411 non-null float64
20 Gencode_34_referenceContext 7411 non-null object
21 Gencode_34_otherTranscripts 382 non-null object
22 ACMGLMMLof_LOF_Mechanism 0 non-null float64
23 ACMGLMMLof_Mode_of_Inheritance 0 non-null float64
24 ACMGLMMLof_Notes 0 non-null float64
25 ACMG_recommendation_Disease_Name 0 non-null float64
26 ClinVar_VCF_AF_ESP 0 non-null float64
27 ClinVar_VCF_AF_EXAC 0 non-null float64
28 ClinVar_VCF_AF_TGP 0 non-null float64
29 ClinVar_VCF_ALLELEID 4 non-null float64
30 ClinVar_VCF_CLNDISDB 4 non-null object
31 ClinVar_VCF_CLNDISDBINCL 0 non-null float64
32 ClinVar_VCF_CLNDN 4 non-null object
33 ClinVar_VCF_CLNDNINCL 0 non-null float64
34 ClinVar_VCF_CLNHGVS 4 non-null object
35 ClinVar_VCF_CLNREVSTAT 4 non-null object
36 ClinVar_VCF_CLNSIG 4 non-null object
37 ClinVar_VCF_CLNSIGCONF 0 non-null float64
38 ClinVar_VCF_CLNSIGINCL 0 non-null float64
39 ClinVar_VCF_CLNVC 4 non-null object
40 ClinVar_VCF_CLNVCSO 4 non-null object
41 ClinVar_VCF_CLNVI 0 non-null float64
42 ClinVar_VCF_DBVARID 0 non-null float64
43 ClinVar_VCF_GENEINFO 3 non-null object
44 ClinVar_VCF_MC 0 non-null float64
45 ClinVar_VCF_ORIGIN 3 non-null float64
46 ClinVar_VCF_RS 4 non-null float64
47 ClinVar_VCF_SSR 0 non-null float64
48 ClinVar_VCF_ID 4 non-null float64
49 ClinVar_VCF_FILTER 0 non-null float64
50 LMMKnown_LMM_FLAGGED 7411 non-null bool
51 LMMKnown_ID 0 non-null float64
52 LMMKnown_FILTER"> 7411 non-null object
dtypes: bool(1), float64(24), int64(2), object(26)
memory usage: 2.9+ MB
List of genes having SNPs and INDELs (strictly speaking, mylist is a pandas.core.series.Series object, not a list)
mylist=df["Gencode_34_hugoSymbol"].value_counts()[1:]
mylist
BAGE2 319
KMT2C 122
CR392039.4 69
FAM230C 24
ANKRD36C 24
...
LINC00320 1
AP002373.1 1
YAP1 1
ALG8 1
CR392039.3 1
Name: Gencode_34_hugoSymbol, Length: 213, dtype: int64
df["Gencode_34_proteinChange"].value_counts()
Amino acid substitutions observed for 5 proteins
p.I311M 1
p.N455T 1
p.K412R 1
p.K410E 1
p.L22L 1
p.D112D 1
p.S35S 1
p.T194A 1
p.L198L 1
Name: Gencode_34_proteinChange, dtype: int64
Many columns of the df dataframe contain only NA values. Such columns will be removed.
Also last two columns will be also removed
df.replace('NA', np.nan, inplace=True)
for col in df:
if df[col].isnull().sum(axis=0)==7411:
df=df.drop(col, axis="columns")
df=df.drop(['LMMKnown_LMM_FLAGGED', 'LMMKnown_FILTER">'], axis="columns")
There are 9 SNPs that are located in protein coding sequences.
Indices of that SNPs can be determined by following code:
pr_index=df['Gencode_34_proteinChange'].index[df['Gencode_34_proteinChange'].notnull()].to_list()
pr_index
[524, 1390, 1399, 1400, 5038, 5039, 5040, 5041, 5042]
Extraction SNPs corresponding to proteins
pr=df.loc[pr_index]
pr.iloc[range(9),[0, 5, 8, 10]]
Gencode_34_hugoSymbol Gencode_34_variantClassification Gencode_34_refAllele Gencode_34_tumorSeqAllele2
524 ACIN1 MISSENSE T C
1390 ANKRD36C SPLICE_SITE T G
1399 ANKRD36C MISSENSE T C
1400 ANKRD36C MISSENSE T C
5038 MT-ATP6 SILENT G A
5039 MT-ND3 SILENT C T
5040 MT-ND6 SILENT G A
5041 MT-CYB MISSENSE A G
5042 MT-CYB SILENT A G
Conclusion: From the above table, we see that there are 4 missense mutations in genes: ACIN1 p.I311M,
ANKRD36C p.K412R, ANKRD36C p.K410E, MT-CYB p.T194A, and one splice site mutation in ANKRD36C p.N455T
Back to bioinformatics project page: Back to bioinformatics project page
Back to home: Back to home