很多时候,我们需要对取出的SNV进行注释,这个时候可能会在R上进行注释,通常注释文件都含有Chr染色体)、Start开始位点)、End结束位点)、Description(描述),而我们的SNV文件通常是拥有Position(位置),因此我们可以先定位Chr,再用Postion去定位到Start和End之间,找到相对应的Description。为了加快速度,可以使用二分查找法。
1 for value in dt$value){ 2 #df:data.frame, V1 and V2 should be Start and End value: Postition used to find region return:df row number where position locates ,if no region return -1 3 low=1 4 high=nrowdf) 5 mid=high %/% 2 6 if df[low,1] <= value & value <= df[low,2]) low 7 else if df[high,1] <= value & value <= df[high,2]) high 8 else{ 9 while value > df[mid,2] || value < df[mid,1]){ 10 if value > df[mid,2]){ 11 low = mid+1 12 } else if value < df[mid,1]) { 13 high = mid - 1 14 } 15 ifhigh<low){ 16 mid=-1;break 17 } 18 mid=low+high)%/%2 19 } 20 mid 21 } 22 }
在R中使用for循环效率低,因此也可以用data.table包的foverlap函数,改进代码如下,对bed文件进行注释,如果要对snv进行注释,只需要将snv改成相应的start和end相等的bed文件即可。
1 #!/bin/Rscript 2 3 librarydata.table) 4 5 arg <- commandArgsT) 6 if lengtharg) != 3) { 7 message"[usage]: BedAnnoGene.R bedfile gtffile outputfile") 8 message" bedfile format: chr start end informationArbitrary but can not be lacked)") 9 message" GTFfile: gtf file downloaded from GENCODE") 10 message" outputfile: file to be writen out") 11 message" needed package: data.table 1.10.4") 12 stop"Please check your arguments!") 13 } 14 15 bedfile <- arg[1] 16 annofile <- arg[2] 17 outfile <- arg[3] 18 19 #read file 20 anno <- freadannofile,sep=" ",header=F) 21 bed <- freadbedfile,sep=" ",header=F) 22 setnamesanno,c"V1","V2","V3","V4","V5","V9"),c"Chr","Gene","Type","Start","End","Info")) 23 anno <- anno[Type=="gene",.Chr,Start,End,Gene=sapplystrsplittstrsplitInfo,";")[3][[1]],"""),functionx)x[2]))] 24 setkeyanno,Chr,Start,End) 25 setkeybed,V1,V2,V3) 26 27 #find overlaps by Chr 28 lst <- list) 29 for ChrI in intersectuniquebed$V1),uniqueanno$Chr))){ 30 anno_reg <- anno[Chr == ChrI,.Start,End)] 31 bed_reg <- bed[V1 == ChrI,.V2,V3)] 32 setkeyanno_reg,Start,End) 33 setkeybed_reg,V2,V3) 34 overl <- foverlapsbed_reg,anno_reg,which=TRUE,nomatch = 0) 35 if nrowoverl) > 0){ 36 lst[[ChrI]] <- data.tableChr=ChrI,bed[V1 == ChrI,][overl[["xid"]],.V2,V3,V4)],anno[Chr == ChrI][overl[["yid"]],.Gene)]) 37 } 38 } 39 merge_dt <- rbindlistlst) 40 setnamesmerge_dt,c"V2","V3","V4"),c"Start","End","Name")) 41 42 #if one region has more than one gene 43 torm <- list) 44 for i in 1:nrowmerge_dt)-1)){ifmerge_dt[i,"Name"]==merge_dt[i+1,"Name"]){setmerge_dt,i+1L,ncolmerge_dt),pastemerge_dt[i,"Gene"],merge_dt[i+1,"Gene"],sep=";"));torm <- ctorm,listi))}} 45 torm <- unlisttorm) 46 merge_dt <- merge_dt[-torm,] 47 48 fwritemerge_dt,file=outfile)
使用帮助可以在我github看到 https://github.com/yiliu4234/BedAnnoGene