# python3.8 __author__ = "Peng Zhang" __copyright__ = "Copyright 2025, " \ "Laboratory of Human Genetics of Infectious Diseases, The Rockefeller University" __license__ = "CC BY-NC-ND 4.0" __version__ = "ver-1, 05-01-2025" import os import re import time import argparse print('******************************************') print(' #### ##### # # ##### #### ##### ') print(' # # # # # # # # ## ') print(' # # # # # ### #### # ') print(' # # # # # # # # ## ') print(' #### ##### # ##### # # ##### ') print(' Deep Intronic Variants ') print(' with Effect on Recursive Splicing ') print('*******************************************\n') ### # input parameters ### timestamp = str(time.strftime('%Y%m%d-%H%M%S', time.localtime())) parser = argparse.ArgumentParser(description="DIVERS") parser.add_argument("-i", "--input", help="input variants, in VCF format") args = parser.parse_args() filename_var = args.input filename_bed = filename_var+'.bed' filename_map = filename_var+'.mapping' filename_out = filename_var+'.DIVERS_'+timestamp+'.csv' base_pairing = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N'} def rev_seq(fwd_seq): out_seq = '' for fwd_pos in range(0, len(fwd_seq)): out_seq += base_pairing[fwd_seq[len(fwd_seq) - fwd_pos - 1]] return out_seq var_input_set = set() var_output_set = set() var_output_annot = 0 var_count_RS_AGGT = 0 var_count_RS_BP = 0 var_count_RS_BP2 = 0 var_count_RS_AGAIN = 0 var_count_RS_DW5SS = 0 var_count_CRYPRS = 0 ### # DIVERS running ### try: file_var = open(filename_var, 'r') file_bed = open(filename_bed, 'w') var_info_header = ','.join(file_var.readline().strip().split('\t')[5:]) for eachline in file_var: if not eachline.startswith('#'): eachline = eachline.replace(',', ';') eachline = eachline.replace('*', ';') item = eachline.strip().split('\t') chrom = item[0] pos = item[1] var_id = item[2] ref = item[3] alt = item[4] if 'chr' not in chrom: chrom = 'chr' + chrom var_name = chrom+'*'+pos+'*'+var_id+'*'+ref+'*'+alt var_info = ','.join(item[5:]) if not var_info: var_info = '.' var_start = var_end = var_type = '.' if ref.isalpha() and alt.isalpha(): if len(ref) == 1 and len(alt) == 1: var_type = 'snv' var_start = str(int(pos)-1) var_end = pos elif 10 >= len(ref) > 1 == len(alt): var_type = 'del-'+str(len(ref)-1)+'nt' var_start = pos var_end = str(int(pos)+len(ref)-1) elif 10 >= len(alt) > 1 == len(ref): var_type = 'ins-'+str(len(alt)-1)+'nt' var_start = pos var_end = str(int(pos)+len(alt)-1) if (var_type != '.') and (var_name not in var_input_set): var_input_set.add(var_name) file_bed.write(chrom+'\t'+var_start+'\t'+var_end+'\t'+var_name+'\t.\t+\t'+var_type+'\t'+var_info+'\n') file_bed.write(chrom+'\t'+var_start+'\t'+var_end+'\t'+var_name+'\t.\t-\t'+var_type+'\t'+var_info+'\n') file_var.close() file_bed.close() # DIVERS mapping, detection and annotation os.system("bedtools intersect -wo -s -a " + filename_bed + " -b DIVERS_Detection.bed > " + filename_map) file_map = open(filename_map, 'r') file_out = open(filename_out, 'w') file_out.write('CHR,POS,ID,REF,ALT,STRAND,VAR_TYPE,GENE,TRANSCRIPT,IVS#,IVS_SIZE,' 'RS#,RS_CONSEQ,RS_SCORE,RS_POS,BP_POS,PPT_Y,CLIP,RARE,CONSERV,RNALM,'+var_info_header+'\n') for eachline in file_map: item = eachline.strip().split('\t') chrom = item[0] var_start = int(item[1]) var_end = int(item[2]) var_name = item[3] strand = item[5] var_type = item[6] var_info = item[7] RS_region_start = int(item[9]) RS_region_end = int(item[10]) gene = item[14] transcript = item[15] ivs = item[16] ivs_size = item[17] RS = item[18] BP_pos = item[19] PPT = item[20] RS_pos = item[21] clip = item[22] rare = item[23] conserv = item[24] rnalm = item[25] RS_score = item[26] RS_conseq = item[27] seq_wt = item[28] seq_mt = '.' var_name_item = var_name.split('*') var_pos = var_name_item[1] var_id = var_name_item[2] var_ref = var_name_item[3] var_alt = var_name_item[4] if var_info == '.': var_info = '' DIVERS_flag = 0 # RS-AGGT if RS_conseq == 'RS-AGGT': DIVERS_flag = 1 var_count_RS_AGGT += 1 var_output_set.add(var_name) # RS-BP/BP2 if RS_conseq == 'RS-BP': DIVERS_flag = 1 var_count_RS_BP += 1 var_output_set.add(var_name) if RS_conseq == 'RS-BP2': if var_type == 'snv': if ((strand == '+' and var_alt in ['A','G']) or (strand == '-' and var_alt in ['C','T'])): DIVERS_flag = 1 var_count_RS_BP2 += 1 var_output_set.add(var_name) else: DIVERS_flag = 1 var_count_RS_BP2 += 1 var_output_set.add(var_name) # RS-AGAIN if RS_conseq == 'RS-AGAIN': nucl_list = list(seq_wt) if var_type == 'snv': if strand == '+': var_index = int(var_pos)-RS_region_start-1 nucl_list[var_index] = var_alt else: var_index = RS_region_end-int(var_pos) nucl_list[var_index] = base_pairing[var_alt] elif 'ins' in var_type: if strand == '+': var_index = int(var_pos)-RS_region_start-1 nucl_list[var_index] = var_alt else: var_index = RS_region_end-int(var_pos) nucl_list[var_index] = rev_seq(var_alt) elif ('del' in var_type) and (RS_region_end-len(var_ref) > int(var_pos) > RS_region_start): if strand == '+': var_index = int(var_pos)-RS_region_start-1 for temp_index in range(var_index+1, var_index+len(var_ref)): nucl_list[temp_index] = '' else: var_index = RS_region_end-int(var_pos) for temp_index in range(var_index-len(var_ref)+1, var_index): nucl_list[temp_index] = '' seq_mt = ''.join(nucl_list) AG_hit_wt = set(hit.end() for hit in re.finditer('AG', seq_wt)) AG_hit_mt = set(hit.end() for hit in re.finditer('AG', seq_mt)) AG_hit_set = AG_hit_mt - AG_hit_wt if 'del' in var_type: AG_hit_set_temp = set() for each_pos in AG_hit_set: if (each_pos+len(var_ref)-1) not in AG_hit_wt: AG_hit_set_temp.add(each_pos) AG_hit_set = AG_hit_set_temp if 'ins' in var_type: AG_hit_set_temp = set() for each_pos in AG_hit_set: if (each_pos-len(var_alt)+1) not in AG_hit_wt: AG_hit_set_temp.add(each_pos) AG_hit_set = AG_hit_set_temp if AG_hit_set: AG_hit_list = list(AG_hit_set) AG_hit_list.sort() AG_hit = AG_hit_list[-1] if seq_mt[AG_hit-3] in ['C','T']: DIVERS_flag = 1 var_count_RS_AGAIN += 1 var_output_set.add(var_name) # RS-DW5SS if RS_conseq.startswith('RS-DW5SS'): seq_wt_item = seq_wt.split('|') check_ref = seq_wt_item[0] check_alt = seq_wt_item[1] if var_ref == check_ref and var_alt == check_alt: DIVERS_flag = 1 var_count_RS_DW5SS += 1 var_output_set.add(var_name) # CRYPRS if ('CRYPRS' in RS_conseq) and (var_name not in var_output_set): CRYPRS_type = seq_wt CRYPRS_flag = 0 if CRYPRS_type == 'NGGT': if (strand == '+' and var_alt == 'A') or (strand == '-' and var_alt == 'T'): CRYPRS_flag = 1 if CRYPRS_type == 'ANGT': if (strand == '+' and var_alt == 'G') or (strand == '-' and var_alt == 'C'): CRYPRS_flag = 1 if CRYPRS_type == 'AGNT': if (strand == '+' and var_alt == 'G') or (strand == '-' and var_alt == 'C'): CRYPRS_flag = 1 if CRYPRS_type == 'AGGN': if (strand == '+' and var_alt == 'T') or (strand == '-' and var_alt == 'A'): CRYPRS_flag = 1 if CRYPRS_flag: DIVERS_flag = 1 var_count_CRYPRS += 1 var_output_set.add(var_name) # DIVERS Output if DIVERS_flag: var_output_annot += 1 file_out.write(chrom+','+var_pos+','+var_id+','+var_ref+','+var_alt+','+strand+','+var_type+','+ gene+','+transcript+','+ivs+','+ivs_size+','+RS+','+RS_conseq+','+RS_score+','+ RS_pos+','+BP_pos+','+PPT+','+clip+','+rare+','+conserv+','+rnalm+','+var_info+'\n') file_out.close() file_map.close() os.remove(filename_bed) os.remove(filename_map) print('Input file:', filename_var) print('Output file:', filename_out, '\n') print('# Input variants:', str(len(var_input_set))) print('# Output variants:', str(len(var_output_set))) print('# Output annotations:', str(var_output_annot), '\n') print('# RS-AGGT:\t', str(var_count_RS_AGGT)) print('# RS-BP:\t', str(var_count_RS_BP)) print('# RS-BP2:\t', str(var_count_RS_BP2)) print('# RS-AGAIN:\t', str(var_count_RS_AGAIN)) print('# RS-DW5SS:\t', str(var_count_RS_DW5SS)) print('# CRYPRS:\t', str(var_count_CRYPRS), '\n') except Exception as error_message: print(f"An error occurred: {error_message}") print("Please check your input file, or contact pzhang@rockefeller.edu.\n")