call SNP
一、参考基因组索引
需要构建索引
1、 bwa索引:bwa index Genome.fasta
2、 picard索引:java -jar picard.jar CreatSequenceDictionary R=Genome.fasta O=Genome.dict
3、 samtools faidx Genome.fasta
索引已经建好路径:/w/00/g/g04/user608/refgenome/Genome.fasta
二、个体和居群信息
将个体和居群信息对应起来
居群采样号 | 分析居群号 | |
---|---|---|
GGA-22-1 | P001H01A | |
GGA-22-2 | P002H01A |
三、Vcf
1、 比对个体bwa-mem
`$time -vo output.01.bam/Pd14008_04.01.bam.time bash -c "$bwa mem -t 8 -R '@RG\tID:Pd14008_04\tSM:Pd14008_04\tLB:Pd14008_04' data/Genome.fasta data/Pop/Pd14008_04.1.fq.gz data/Pop/Pd14008_04.2.fq.gz | $samtools view -1 -o output.01.bam/Pd14008_04.01.bam - 2> output.01.bam/Pd14008_04.01.bam.err"`
`任务设置:8 20`
`时间:大概10个小时左右`
2、 排序Picard-sortsam
任务设置:4 10
时间:1个小时30分钟左右
3、 标记重复picard.jar-MarkDuplicates
对于重复的去除问题和详细的解释,华大这篇推文很全面:《NGS攻略》之Dup传 (qq.com)
还有这两篇简书:关于二代测序中的Duplication - 简书 (jianshu.com)
讨厌又迷人的reads去重复 - 简书 (jianshu.com)
任务设置:8 40
时间:2个小时30分钟左右
4、 计算MD5和index Picard-sortsam
任务设置:4 10
时间:2个小时
5、 计算深度samtools depth
任务设置:2 3
时间:半个小时
测序深度的计算,你真的掌握了吗 - 腾讯云开发者社区-腾讯云 (tencent.com)
深度计算bedtools和samtools不太一样看,samtools会过滤flag 365的reads
6、 统计信息samtools stats
统计mapped_ratio、properlypaired_ratio、percent_duplication、genome_coverage
任务设置:1 1
时间:1个半小时
7、 出gvcf gatk haplotypecaller
分染色体gvcf
任务设置:4 20
时间:12个小时30分钟左右
Call 14-2718染色体
任务设置:20 20,感觉10G的内存就完全足够了
时间:2个小时30分钟左右
关于参数-ERC BP_RESOLUTION 和GVCF,后者使用块的方式输出每个位点,相比前者文件会更小,两种效果一样,不影响后续分析。但不清楚对合并效率有没有影响,先使用第一个。简书链接:gvcf文件与vcf文件 - 简书 (jianshu.com)
8、 分染色体合并,然后cat合并
任务设置:15 15
时间:3个小时左右
9、 将所有染色体合并combine_sum
任务设置:20 40
时间:14个小时
10、基因定型 Genotypes
任务设置:60 115分型16个个体
时间:
四、snp过滤
1、 标记质量值小于30和indel5bp
任务设置:10 10 此时一个gvcf应该是10G左右—threads没有用多线程
时间:12min
2、 删除低质量和indel5bp
3、 删除indel
4、 删除alle大于2
5、 依据深度过滤
根据师兄的脚本先把每个样本平均深度算出来
然后删除小于三分之一和大于平均深度三倍的点
算平均深度任务设置:5 5
时间:
过滤任务设置:
时间:
6、 删掉外类群(我没有不需要)
7、 根据缺失过滤&SNP频率过滤(个性化,需要根据群体情况调整)
8、 HWE,LD过滤(个性化,需要根据群体情况调整)
五、python代码协助生成命令
"""
拼写命令的脚本
日期:2022年1月13日
时间:22点14分
"""
import re
def bwa_mem_spell(dirlist):#拼写bwa mem命令
temp=[]
for arg1 in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/01bam_mem/"+arg1+"_bam_ali.time bash -c \"bwa mem -t 20 -R '@RG\\tID:"+arg1+\
"\\tSM:"+arg1+"\\tLB:Illumina' /w/00/g/g04/user608//w/00/g/g04/user608/refgenome/Genome.fasta /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/raw/IV-2019_0166A_1/"+\
arg1+"_raw_1.fq.gz "+"/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/raw/IV-2019_0166A_1/"+arg1+"_raw_2.fq.gz "+\
"| samtools view -1 -o /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/01bam_mem/"+arg1+".bam - 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/01bam_mem/"+arg1+".bam.err\""
temp.append(cmdline+"\n")
#print(cmdline)
return temp
def bam_picard_sort(dirlist):#拼写picard排序命令
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/02bam_sort/"+arg+"_bam_sort.time bash -c \"java -Xmx6G -jar /w/00/u/user608/scripts/picard/picard.jar SortSam I=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/01bam_mem/"+\
arg+".bam O=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/02bam_sort/"+arg+".sort.bam SORT_ORDER=coordinate 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/02bam_sort/"+arg+".sort.bam.sdout 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/02bam_sort/"+arg+".sort.bam.err\""
temp.append(cmdline+"\n")
print(cmdline)
return temp
def bam_picard_markdu(dirlist):#标记重复的命令
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg+\
"_bam_picard_markdu.time bash -c \"java -Xmx40G -jar /w/00/u/user608/scripts/picard/picard.jar MarkDuplicates I=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/02bam_sort/"+\
arg+".sort.bam O=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg+"_bam_picard_markdu.bam M=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg\
+"_bam_picard_markdu.bam.metrics OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg\
+"_bam_picard_markdu.bam 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg+"_bam_picard_markdu.bam.err\""
temp.append(cmdline+"\n")
print(cmdline)
return temp
def create_MD5_index(dirlist):#再次用sortsam来计算MD5值和index
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".time bash -c \"java -Xmx10G -jar /w/00/u/user608/scripts/picard/picard.jar SortSam COMPRESSION_LEVEL=2 "+\
"I=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/03bam_picard_markdu/"+arg+"_bam_picard_markdu.bam O=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+\
".bam SORT_ORDER=coordinate CREATE_MD5_FILE=true CREATE_INDEX=true TMP_DIR=/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/tmp/ 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+\
".stdout 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".err\""
temp.append(cmdline+"\n")
print(cmdline)
return temp
def compute_depth(dirlist):#计算深度的命令
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam.depth.time bash -c \"samtools depth /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+\
".bam | gzip > /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam.depth.gz 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam.depth.err\""
temp.append(cmdline+"\n")
print(cmdline+"\n")
return temp
def samtoos_stats(dirlist):#统计bam覆盖度
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam.stats.time bash -c \"samtools stats --threads 10 "+\
"/w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam > /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam.stats\""
temp.append(cmdline+"\n")
return temp
def gatk_haplotypeCaller(dirlist):#1-13分开,再加上14-2718条gatk haplotypecaller
temp=[]
for arg in dirlist:
'''
cmdline14_2718="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/sea_mission/jobs/time/05gvcf/HiC_scaffold_14_2718/"+arg+\
".gvcf.gz.time bash -c \"gatk --java-options '-Xmx20g' HaplotypeCaller -R /w/00/g/g04/user608/refgenome/Genome.fasta -I /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+\
".bam -ERC BP_RESOLUTION -ploidy 2 -L output/jobs/HiC_scaffold.list -O /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+arg+\
".gvcf.gz 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+arg+".gvcf.gz.stdout 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+arg+".gvcf.gz.err\"
'''
for i in range(1,14):
cmdline= "/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+arg+\
".gvcf.gz.time bash -c \"gatk --java-options '-Xmx20g' HaplotypeCaller -R /w/00/g/g04/user608/refgenome/Genome.fasta -I /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+\
".bam -ERC BP_RESOLUTION -ploidy 2 -L HiC_scaffold_"+str(i)+" -O /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+arg+\
".gvcf.gz 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+arg+".gvcf.gz.stdout 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+arg+".gvcf.gz.err\""
temp.append(cmdline+"\n")
#print(cmdline)
#temp.append(cmdline14_2718+"\n")
#print(cmdline14_2718)
return temp
def combine_per(dirlist):#combinegvcfs将条染色体的样本合在一起
temp=[]
b=""
"""
先生成-V
再len dirlist的长度,进行循环生成命令
"""
for j in dirlist:
b=b+" -V /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+str(j[0:9])+".04.list.gvcf.gz"
cmdline14_2718="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/war_mission/time/combine_per14_2718.time bash -c \"gatk CombineGVCFs -R /w/00/g/g04/user608/refgenome/Genome.fasta "+b+\
" -O /w/00/g/g04/user608/war_mission/combine_per/QJ.chr14_2718.gvcf.gz\""
b=""
#cmdline14_2718="-V"
for i in range(1,14):
cmdline1="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/war_mission/time/combine_per"+str(i)+".time bash -c \"gatk CombineGVCFs -R /w/00/g/g04/user608/refgenome/Genome.fasta"
cmdline2=" -O /w/00/g/g04/user608/war_mission/combine_per/QJ.chr"+str(i)+".gvcf.gz\""
if i==1:
for j in dirlist:
b=b+" -V /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+str(j[0:9])+".04.list.gvcf.gz"
else:
for j in dirlist:
b=b+" -V /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_"+str(i)+"/"+str(j[0:9])+".04.list"+str(i)+".gvcf.gz"
cmdline=cmdline1+b+cmdline2
b=""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def genotypegvcfs():#genotypegvcfs将基因分型
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo output/jobs/time/GenotypeGVCFs/QJ.chr14-2718.time bash -c "+\
"\"gatk --java-options '-Xmx15g' GenotypeGVCFs --include-non-variant-sites -R /w/00/g/g04/user608//w/00/g/g04/user608/refgenome/Genome.fasta "+\
"-V /w/00/g/g04/user608/output/combine_per/QJ.chr14-2718.gvcf.gz -O output/GenotypeGVCFs/QJ.chr14-2718.vcf.gz -ploidy 2\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo output/jobs/time/GenotypeGVCFs/QJ.chr"+str(i)+".time bash -c "+\
"\"gatk --java-options '-Xmx15g' GenotypeGVCFs --include-non-variant-sites -R /w/00/g/g04/user608//w/00/g/g04/user608/refgenome/Genome.fasta "+\
"-V /w/00/g/g04/user608/output/combine_per/QJ.chr"+str(i)+".gvcf.gz -O output/GenotypeGVCFs/QJ.chr"+str(i)+".vcf.gz -ploidy 2\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def filter1():#拼写过滤第一步的命令
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter1/QJ.chr14-2718.filter1.time bash -c "+\
"\"bcftools filter -O z -o output/filter/filter1/QJ.chr14_2718.filter1.vcf.gz -s LOWQUAL -e 'QUAL<30' "+\
"--SnpGap 5 --set-GTs . /w/00/g/g04/user608/output/GenotypeGVCFs/QJ.chr14-2718.vcf.gz\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter1/QJ.chr"+str(i)+".filter1.time bash -c "+\
"\"bcftools filter -O z -o output/filter/filter1/QJ.chr"+str(i)+".filter1.vcf.gz -s LOWQUAL -e 'QUAL<30' "+\
"--SnpGap 5 --set-GTs . /w/00/g/g04/user608/output/GenotypeGVCFs/QJ.chr"+str(i)+".vcf.gz\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def filter2():#拼写过滤第二步的命令
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter2/QJ.chr14-2718.filter2.time bash -c "+\
"\"gzip -dc output/filter/filter1/QJ.chr14-2718.filter1.vcf.gz |grep -v 'LOWQUAL'|grep -v 'SnpGap'"+\
"| bgzip -l 2 > output/filter/filter2/QJ.chr14-2718.filter2.vcf.gz\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter2/QJ.chr"+str(i)+".filter2.time bash -c "+\
"\"gzip -dc output/filter/filter1/QJ.chr"+str(i)+".filter1.vcf.gz |grep -v 'LOWQUAL'|grep -v 'SnpGap'"+\
"| bgzip -l 2 > output/filter/filter2/QJ.chr"+str(i)+".filter2.vcf.gz\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def filter3():#拼写过滤第三步的命令
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter3/QJ.chr14-2718.filter3.time bash -c "+\
"\"bcftools view -v snps -O z -o output/filter/filter3/QJ.chr14-2718.filter3.vcf.gz "+\
"/w/00/g/g04/user608/output/filter/filter2/QJ.chr14-2718.filter2.vcf.gz\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter3/QJ.chr"+str(i)+".filter3.time bash -c "+\
"\"bcftools view -v snps -O z -o output/filter/filter3/QJ.chr"+str(i)+".filter3.vcf.gz "+\
"/w/00/g/g04/user608/output/filter/filter2/QJ.chr"+str(i)+".filter2.vcf.gz\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def filter4():#过滤第四步保留alle2
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter4/QJ.chr14-2718.filter4.time bash -c "+\
"\"bcftools view -m2 -M2 -v snps -O z -o output/filter/filter4/QJ.chr14-2718.filter4.vcf.gz "+\
"output/filter/filter3/QJ.chr14-2718.filter3.vcf.gz\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo output/jobs/time/filter/filter4/QJ.chr"+str(i)+".filter4.time bash -c "+\
"\"bcftools view -m2 -M2 -v snps -O z -o output/filter/filter4/QJ.chr"+str(i)+".filter4.vcf.gz "+\
"output/filter/filter3/QJ.chr"+str(i)+".filter3.vcf.gz\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def test():#md5校验
temp=[]
cmdline14_2718="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/output/05gvcf/md5.14-2718.time bash "+\
"-c \"md5sum /w/00/g/g04/user608/output/05gvcf/HiC_scaffold_14_2718/* > /w/00/g/g04/user608/output/05gvcf/md5.14-2718.txt\""
for i in range(1,14):
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/output/05gvcf/md5."+str(i)+".time bash "+\
"-c \"md5sum /w/00/g/g04/user608/output/05gvcf/HiC_scaffold_"+str(i)+"/* > /w/00/g/g04/user608/output/05gvcf/md5."+str(i)+".txt\""
temp.append(cmdline+"\n")
temp.append(cmdline14_2718+"\n")
return temp
def gvcf_per():
temp=[]
b=""
samlist=['P0188H12A', 'P0928H01A', 'P0928H02A', 'P0964H07A', 'P0964H14A', 'P1023H11A', 'P1023H13A', 'P1023H14A',
'P1070H14A', 'P1070H15A', 'P1082H13A', 'P1931H01A', 'P1931H02A', 'P1931H03A', 'P1931H04A', 'P1931H05A', 'P2019H08A',
'P2039H01A', 'P2039H03A', 'P2506H08A', 'P2739H09A', 'P2739H15A', 'P6130H03A', 'P6135H01A', 'P6135H06A',
'P6135H07A', 'P6145H04A', 'P6145H06A', 'P6145H07A', 'P6145H08A']
for i in samlist:
b=b+" -V /w/00/g/g04/user608/new_mission/gvcf_indiv/"+str(i)+".s0002.list.gvcf.gz"
for i in range(1,14):
cmdline1="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/new_mission/time/combine_per"+str(i)+".time bash -c \"gatk CombineGVCFs -R /w/00/g/g04/user608//w/00/g/g04/user608/refgenome/Genome.fasta"
cmdline2=" -L HiC_scaffold_"+str(i)+" -O /w/00/g/g04/user608/new_mission/gvcf/HiC_scaffold_"+str(i)+"/QJ.chr"+str(i)+".gvcf.gz\""
cmdline=cmdline1+b+cmdline2
temp.append(cmdline+"\n")
cmdline1="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/new_mission/time/combine_per14-2718.time bash -c \"gatk CombineGVCFs -R /w/00/g/g04/user608//w/00/g/g04/user608/refgenome/Genome.fasta"
cmdline2=" -L /w/00/g/g04/user608/new_mission/HiC_scaffold.list -O /w/00/g/g04/user608/new_mission/gvcf/HiC_scaffold_14_2718/QJ.chr14-2718.gvcf.gz\""
cmdline14_2718=cmdline1+b+cmdline2
temp.append(cmdline14_2718+"\n")
return temp
def sam_stats():#统计bam文件
temp=[]
filelist=['P0910H03A.04.bam', 'P0910H04A.04.bam', 'P1218H02A.04.bam', 'P1218H04A.04.bam', 'P1218H07A.04.bam', 'P1801H01A.04.bam', 'P1801H04A.04.bam',
'P1804H03A.04.bam', 'P1804H04A.04.bam', 'P1810H06A.04.bam', 'P1810H09A.04.bam', 'P1815H01A.04.bam', 'P1815H06A.04.bam', 'P2020H02A.04.bam',
'P2020H03A.04.bam', 'P2739H12A.04.bam']
filepath="/w/00/g/g04/user608/war_mission/bam/"
for i in filelist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/war_mission/time/"+str(i)+".stats.time bash -c \"samtools stats --threads 10 "+\
filepath+str(i)+" > "+filepath+str(i[0:9])+"/"+str(i)+".stats\""
temp.append(cmdline+"\n")
return temp
def plot_bamstats():#画图呈现bam统计结果
temp=[]
filelist=['P0910H03A.04.bam', 'P0910H04A.04.bam', 'P1218H02A.04.bam', 'P1218H04A.04.bam', 'P1218H07A.04.bam', 'P1801H01A.04.bam', 'P1801H04A.04.bam',
'P1804H03A.04.bam', 'P1804H04A.04.bam', 'P1810H06A.04.bam', 'P1810H09A.04.bam', 'P1815H01A.04.bam', 'P1815H06A.04.bam', 'P2020H02A.04.bam',
'P2020H03A.04.bam', 'P2739H12A.04.bam']
filepath="/w/00/g/g04/user608/war_mission/bam/"
for i in filelist:
cmdline="/data/apps/samtools/samtools-1.5/bin/plot-bamstats -p /w/00/g/g04/user608/war_mission/bam/"+str(i[0:9])+"/ "+filepath+str(i[0:9])+"/"+str(i)+".stats"
temp.append(cmdline+"\n")
return temp
def testqwer():
temp=[]
filelist=['P0910H03A.04.bam', 'P0910H04A.04.bam', 'P1218H02A.04.bam', 'P1218H04A.04.bam', 'P1218H07A.04.bam', 'P1801H01A.04.bam', 'P1801H04A.04.bam',
'P1804H03A.04.bam', 'P1804H04A.04.bam', 'P1810H06A.04.bam', 'P1810H09A.04.bam', 'P1815H01A.04.bam', 'P1815H06A.04.bam', 'P2020H02A.04.bam',
'P2020H03A.04.bam', 'P2739H12A.04.bam']
for i in filelist:
cmdline="/data/apps/time/1.9/bin/time -vo war_mission/time/"+str(i[0:9])+".chr14_2718.time bash -c \"gatk --java-options '-Xmx20g' HaplotypeCaller -R /w/00/g/g04/user608/refgenome/Genome.fasta -I "+\
"/w/00/g/g04/user608/war_mission/bam/"+str(i)+" -ERC BP_RESOLUTION -ploidy 2 -L output/jobs/HiC_scaffold.list "+\
"-O /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+str(i[0:9])+".04.list.gvcf.gz 1> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+str(i[0:9])+\
".04.list.gvcf.gz.stdout 2> /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/05gvcf/HiC_scaffold_14_2718/"+str(i[0:9])+".04.list.gvcf.gz.err\""
temp.append(cmdline+"\n")
return temp
def nquire_draw_cds_bam(dirlist):#生成nquire需要的cds区bam
temp=[]
for arg in dirlist:
cmdline="/data/apps/time/1.9/bin/time -vo /w/00/g/g04/user608/17nQuire/output/draw_region_"+arg+".bam.time "+\
"samtools view -hb -L /w/00/g/g04/user608/17nQuire/bam/cds_region /w/00/g/g04/user608/2023.01.30.Gentiana_cruciata/call_snp/04bam_MD5_index/"+arg+".bam "+\
"-o /w/00/g/g04/user608/17nQuire/output/"+arg+".bam\""
temp.append(cmdline+'\n')
return temp
def main():#主函数,需要修改列表和生成txt文件名字
filename="cmdline.txt"
dirlist=['P0166H01A','P0166H02A','P0166H03A','P0166H04A','P0166H05A']
cmdlines=compute_depth(dirlist)
with open(filename,"w") as f:
for cmdline in cmdlines:
f.write(cmdline)
if __name__== "__main__":
main()
PS:其实内存根本用不了那么多,一定要试着先跑一个,然后去看内存的使用情况,根据情况来跑后边的。