# python3.8 __author__ = "Peng Zhang" __copyright__ = "Copyright 2025, " \ "Laboratory of Human Genetics of Infectious Diseases, The Rockefeller University" __license__ = "CC BY-NC-ND 4.0" __version__ = "ver-1, 05-01-2025" import os import re import time import argparse print('******************************************') print(' #### ##### # # ##### #### ##### ') print(' # # # # # # # # ## ') print(' # # # # # ### #### # ') print(' # # # # # # # # ## ') print(' #### ##### # ##### # # ##### ') print(' Deep Intronic Variants ') print(' with Effect on Recursive Splicing ') print('*******************************************\n') ### # input parameters ### timestamp = str(time.strftime('%Y%m%d-%H%M%S', time.localtime())) parser = argparse.ArgumentParser(description="DIVERS") parser.add_argument("-d", "--dir", help="directory of VCF files") parser.add_argument("-s", "--sample", help="sample list") parser.add_argument("-o", "--output", help="output filename, in CSV format") args = parser.parse_args() directory = args.dir filename_sample = args.sample filename_out = args.output if filename_out.endswith('.csv'): file_out = open(filename_out, 'w') else: file_out = open(filename_out+'.csv', 'w') base_pairing = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N'} def rev_seq(fwd_seq): out_seq = '' for fwd_pos in range(0, len(fwd_seq)): out_seq += base_pairing[fwd_seq[len(fwd_seq) - fwd_pos - 1]] return out_seq file_sample = open(filename_sample, 'r') first_read = True for eachsample in file_sample: sample = eachsample.strip() filename_var = sample+'.vcf' filename_bed = filename_var+'.bed' filename_map = filename_var+'.mapping' var_input_set = set() var_output_set = set() var_output_annot = 0 var_count_RS_AGGT = 0 var_count_RS_BP = 0 var_count_RS_BP2 = 0 var_count_RS_AGAIN = 0 var_count_RS_DW5SS = 0 var_count_CRYPRS = 0 ### # DIVERS running ### try: file_var = open(directory + filename_var, 'r') file_bed = open(filename_bed, 'w') if first_read: var_info_header = ','.join(file_var.readline().strip().split('\t')[5:]) file_out.write('SAMPLE,CHR,POS,ID,REF,ALT,STRAND,VAR_TYPE,GENE,TRANSCRIPT,IVS#,IVS_SIZE,' 'RS#,RS_CONSEQ,RS_SCORE,RS_POS,BP_POS,PPT_Y,CLIP,RARE,CONSERV,RNALM,'+var_info_header+'\n') first_read = False else: file_var.readline() for eachline in file_var: if not eachline.startswith('#'): eachline = eachline.replace(',', ';') eachline = eachline.replace('*', ';') item = eachline.strip().split('\t') chrom = item[0] pos = item[1] var_id = item[2] ref = item[3] alt = item[4] if 'chr' not in chrom: chrom = 'chr' + chrom var_name = chrom + '*' + pos + '*' + var_id + '*' + ref + '*' + alt var_info = ','.join(item[5:]) if not var_info: var_info = '.' var_start = var_end = var_type = '.' if ref.isalpha() and alt.isalpha(): if len(ref) == 1 and len(alt) == 1: var_type = 'snv' var_start = str(int(pos) - 1) var_end = pos elif 10 >= len(ref) > 1 == len(alt): var_type = 'del-' + str(len(ref) - 1) + 'nt' var_start = pos var_end = str(int(pos) + len(ref) - 1) elif 10 >= len(alt) > 1 == len(ref): var_type = 'ins-' + str(len(alt) - 1) + 'nt' var_start = pos var_end = str(int(pos) + len(alt) - 1) if (var_type != '.') and (var_name not in var_input_set): var_input_set.add(var_name) file_bed.write(chrom + '\t' + var_start + '\t' + var_end + '\t' + var_name + '\t.\t+\t' + var_type + '\t' + var_info + '\n') file_bed.write(chrom + '\t' + var_start + '\t' + var_end + '\t' + var_name + '\t.\t-\t' + var_type + '\t' + var_info + '\n') file_var.close() file_bed.close() # DIVERS mapping, detection and annotation os.system("bedtools intersect -wo -s -a " + filename_bed + " -b DIVERS_Detection.bed > " + filename_map) file_map = open(filename_map, 'r') for eachline in file_map: item = eachline.strip().split('\t') chrom = item[0] var_start = int(item[1]) var_end = int(item[2]) var_name = item[3] strand = item[5] var_type = item[6] var_info = item[7] RS_region_start = int(item[9]) RS_region_end = int(item[10]) gene = item[14] transcript = item[15] ivs = item[16] ivs_size = item[17] RS = item[18] BP_pos = item[19] PPT = item[20] RS_pos = item[21] clip = item[22] rare = item[23] conserv = item[24] rnalm = item[25] RS_score = item[26] RS_conseq = item[27] seq_wt = item[28] seq_mt = '.' var_name_item = var_name.split('*') var_pos = var_name_item[1] var_id = var_name_item[2] var_ref = var_name_item[3] var_alt = var_name_item[4] if var_info == '.': var_info = '' DIVERS_flag = 0 # RS-AGGT if RS_conseq == 'RS-AGGT': DIVERS_flag = 1 var_count_RS_AGGT += 1 var_output_set.add(var_name) # RS-BP/BP2 if RS_conseq == 'RS-BP': DIVERS_flag = 1 var_count_RS_BP += 1 var_output_set.add(var_name) if RS_conseq == 'RS-BP2': if var_type == 'snv': if ((strand == '+' and var_alt in ['A', 'G']) or (strand == '-' and var_alt in ['C', 'T'])): DIVERS_flag = 1 var_count_RS_BP2 += 1 var_output_set.add(var_name) else: DIVERS_flag = 1 var_count_RS_BP2 += 1 var_output_set.add(var_name) # RS-AGAIN if RS_conseq == 'RS-AGAIN': nucl_list = list(seq_wt) if var_type == 'snv': if strand == '+': var_index = int(var_pos) - RS_region_start - 1 nucl_list[var_index] = var_alt else: var_index = RS_region_end - int(var_pos) nucl_list[var_index] = base_pairing[var_alt] elif 'ins' in var_type: if strand == '+': var_index = int(var_pos) - RS_region_start - 1 nucl_list[var_index] = var_alt else: var_index = RS_region_end - int(var_pos) nucl_list[var_index] = rev_seq(var_alt) elif ('del' in var_type) and (RS_region_end - len(var_ref) > int(var_pos) > RS_region_start): if strand == '+': var_index = int(var_pos) - RS_region_start - 1 for temp_index in range(var_index + 1, var_index + len(var_ref)): nucl_list[temp_index] = '' else: var_index = RS_region_end - int(var_pos) for temp_index in range(var_index - len(var_ref) + 1, var_index): nucl_list[temp_index] = '' seq_mt = ''.join(nucl_list) AG_hit_wt = set(hit.end() for hit in re.finditer('AG', seq_wt)) AG_hit_mt = set(hit.end() for hit in re.finditer('AG', seq_mt)) AG_hit_set = AG_hit_mt - AG_hit_wt if 'del' in var_type: AG_hit_set_temp = set() for each_pos in AG_hit_set: if (each_pos + len(var_ref) - 1) not in AG_hit_wt: AG_hit_set_temp.add(each_pos) AG_hit_set = AG_hit_set_temp if 'ins' in var_type: AG_hit_set_temp = set() for each_pos in AG_hit_set: if (each_pos - len(var_alt) + 1) not in AG_hit_wt: AG_hit_set_temp.add(each_pos) AG_hit_set = AG_hit_set_temp if AG_hit_set: AG_hit_list = list(AG_hit_set) AG_hit_list.sort() AG_hit = AG_hit_list[-1] if seq_mt[AG_hit - 3] in ['C', 'T']: DIVERS_flag = 1 var_count_RS_AGAIN += 1 var_output_set.add(var_name) # RS-DW5SS if RS_conseq.startswith('RS-DW5SS'): seq_wt_item = seq_wt.split('|') check_ref = seq_wt_item[0] check_alt = seq_wt_item[1] if var_ref == check_ref and var_alt == check_alt: DIVERS_flag = 1 var_count_RS_DW5SS += 1 var_output_set.add(var_name) # CRYPRS if ('CRYPRS' in RS_conseq) and (var_name not in var_output_set): CRYPRS_type = seq_wt CRYPRS_flag = 0 if CRYPRS_type == 'NGGT': if (strand == '+' and var_alt == 'A') or (strand == '-' and var_alt == 'T'): CRYPRS_flag = 1 if CRYPRS_type == 'ANGT': if (strand == '+' and var_alt == 'G') or (strand == '-' and var_alt == 'C'): CRYPRS_flag = 1 if CRYPRS_type == 'AGNT': if (strand == '+' and var_alt == 'G') or (strand == '-' and var_alt == 'C'): CRYPRS_flag = 1 if CRYPRS_type == 'AGGN': if (strand == '+' and var_alt == 'T') or (strand == '-' and var_alt == 'A'): CRYPRS_flag = 1 if CRYPRS_flag: DIVERS_flag = 1 var_count_CRYPRS += 1 var_output_set.add(var_name) # DIVERS Output if DIVERS_flag: var_output_annot += 1 file_out.write(sample+','+chrom+','+var_pos+','+var_id+','+var_ref+','+var_alt+','+strand+','+var_type+','+ gene+','+transcript+','+ivs+','+ivs_size+','+RS+','+RS_conseq+','+RS_score+','+ RS_pos+','+BP_pos+','+PPT+','+clip+','+rare+','+conserv+','+rnalm+','+var_info+'\n') file_map.close() os.remove(filename_bed) os.remove(filename_map) print('Sample:', sample) print('Input file:', filename_var) print('Output file:', filename_out, '\n') print('# Input variants:', str(len(var_input_set))) print('# Output variants:', str(len(var_output_set))) print('# Output annotations:', str(var_output_annot), '\n') print('# RS-AGGT:\t', str(var_count_RS_AGGT)) print('# RS-BP:\t', str(var_count_RS_BP)) print('# RS-BP2:\t', str(var_count_RS_BP2)) print('# RS-AGAIN:\t', str(var_count_RS_AGAIN)) print('# RS-DW5SS:\t', str(var_count_RS_DW5SS)) print('# CRYPRS:\t', str(var_count_CRYPRS), '\n') except Exception as error_message: print(f"An error occurred: {error_message}") print("Please check your input file, or contact pzhang@rockefeller.edu.\n") file_out.close()