# transcriptome quantification analysis package using multi-platform
# next-generation sequencing data


###############################
#   REVISION HISTORY
#
#   01/05/2009
# Utilized object-oriented programming.
# Sub-classed the code for handling Solexa and iGentifier data.
# Addition of new datatype support would be easy by sub-classing.
#
#   02/02/2009
# allow user to define length of tag sequence for analysis
#
#   02/07/2009
# Gene Ontology annotation & functional categorization added
# for this action, a cascading html page will be produced to let user browse
# the GO mapping result, as is done in EasyGO.



###############################
#   PROGRAM EXECUTION FLOW
#
# > read in sequence tag file
#   Tag file could be:
#   1. raw Solexa sequencing data
#      (this will require grouping to make counts for unique tags)
#   2. pre-compiled iGentifier tag seq
#      (grouping is assumed to be done already)
# > read in cDNA library file in fasta format, with annotation
# > use megablast/wu-blast(n) to map unique seq tags to cDNA library
#   please remember to format cDNA library fasta file
# > parse BLAST result
# > optionally, perform gene ontology categorization analysis
#   using pre-generated GO annotation of cDNA sequence libraries.

import os,sys

class seqTag:
	"""Overall class for sequence tag set handling.

	This class contains universal seq tag handling procedures.
	The (sequencing) platform-specific methods will sub-class from
	this parent class and implement their specific methods.
	So it is not possible to use this class directly.
	"""
	def __init__(self,arg_list=sys.argv[1:]):
		"""Initiate by processing file and create some attributes.

		arg_list

		Read the seq tag file (and do grouping).
		The cDNA library file name will be verified and saved for mapping.
		And it will be read to record its definition line.
		"""
		options = self.parse_argument(arg_list)
		self.opt = options # simply append ``options'' object to self, this is inherently convenient
		# append necessary attributes to object
		self.tag_data = [] # a (huge) list to hold the sequence tag data
		# make function calls
		print
		print '##  %-20s##' % 'Parse input files'
		self.read_tagFile() # read the sequence tag file into memory! this is platform-specific
		self.group_tagFile() # perform grouping
		self.write_tagFile() # write to fasta file for mapping (blast)
		self.parse_libFile()
	def parse_argument(self,arg_list):
		"""Parse the command line arguments, and produce an OptionParser object.

		argument:
		  arg_list: a list of (command line) arguments. usually be
		  sys.argv[1:]
		"""
		from optparse import OptionParser

		usage = 'Usage: %prog arguments... (-h to print help message)'
		opr = OptionParser(usage)
		opr.add_option('-t','--tag-file',action='store',dest='tagFile',help='Sequence tag data file (if analyzing Solexa data, must be in FASTQ format)')
		opr.add_option('-l','--lib-file',action='store',dest='libFile',help='Reference cDNA sequence file (fasta format)')
		#opr.add_option('-T','--tag-type',action='store',dest='type',help='Type of sequence tag [solexa|iGentifier]')
		opr.add_option('-b','--bases-used',action='store',dest='bases',type='int',help='Number of bases of sequence tags to be used for analysis')
		opr.add_option('-m','--mismatch-allowed',action='store',dest='mismatch',type='int',help='Number of mismatch bases allowed for tag-reference alignment (default 1)',default=1)
		opr.add_option('-g','--gap-allowed',action='store',dest='gap',type='int',help='Number of gaps allowed for tag mapping (default 0)',default=0)
		opr.add_option('-o','--out-file',action='store',dest='outFile',help='General name of output file')
		opr.add_option('-G','--gene-ontology',action='store_true',dest='go',default=False,help='Perform Gene Ontology functional analysis. Must also provide arguments to -A, -f options.')
		opr.add_option('-A','--go-annotation',action='store',dest='goAnnoFile',help='GO annotation file for cDNA library entries. This file should only have two columns: 1. sequence name, 2. GO account name (one in each row).')
		opr.add_option('-f','--term-files',action='store',dest='termFileDir',help='The directory where following files reside: term.txt, term2term.txt. They can be obtained from archive.geneontology.org/latest-full/ and should be of same version.')
		(options, args) = opr.parse_args(arg_list)
# Validate the arguments
		if options.tagFile is None:
			opr.error('Missing sequence tag file for option -t.')
		if not os.path.exists(options.tagFile):
			print 'Sequence tag file %s not found.' % options.tagFile
			sys.exit()
		if options.libFile is None:
			opr.error('Missing cDNA sequence file for option -l.')
		if not os.path.exists(options.libFile):
			print 'cDNA sequence file %s not found.' % options.libFile
			sys.exit()
		if options.outFile is None:
			opr.error('Missing output file for option -o.')
		if options.bases is None or options.bases == 0:
			options.use_base_restriction = False
		if options.go == True:
			if options.termFileDir == None:
				opr.error('Missing values for -f option when -G is used.')
			if not os.path.exists(options.termFileDir):
				print 'Term file directory does not exist.'
				sys.exit()
			else:
				termFile = os.path.join(options.termFileDir,'term.txt')
				if os.path.exists(termFile):
					options.termFile = termFile
				else:
					print 'File term.txt not found under directory ' + options.termFileDir
					sys.exit()
				t2tFile = os.path.join(options.termFileDir,'term2term.txt')
				if os.path.exists(t2tFile):
					options.t2tFile = t2tFile
				else:
					print 'File term2term.txt not found under directory: ' + options.termFileDir
					sys.exit()
			# then check -A
			if options.goAnnoFile == None:
				opr.error('Missing value for -A option when using -G.')
			if not os.path.exists(options.goAnnoFile):
				print 'GO annotation file for cDNA not found.'
				sys.exit()
		return options
	def parse_libFile(self):
		"""Parse the cDNA library file, and read the sequence definition line. 

		Produce:
		  self.cdna_desc
		"""
		# read fasta file, for definition line
		try:
			fin = open(self.opt.libFile)
		except IOError,msg:
			print msg
			sys.exit()
		cdna_desc = {} # key: seq name, val: description
		for line in fin:
			if line.startswith('>'):
				lst = line.rstrip().split()
				name = lst[0].replace('>','')
				cdna_desc[name] = ' '.join(lst[1:])
		fin.close()
		self.cdna_desc = cdna_desc
	def read_tagFile(self):
		"""Platform-specific tag file parser.
		
		The method will be implemented/overriden in subclasses, which
		allows handling different file formats for different
		sorces of sequence tags.

		Currently available implementations:
		[ Solexa file ] tag-read and grouping will be mutually executed,
		[ iGentifier] only read tag and pre-generated count data,
		no grouping is required.

		Produces:
		  self.tag_data
		"""
		pass
	def group_tagFile(self):
		"""Perform grouping to tag data.

		Produce:
		  self.tag_seq2id
		  self.tag_id2seq
		  self.tag_count

		Restrictions as implied by ``self.opt.use_base_restriction'' will apply
		"""
		self.tag_seq2id = {} # key: tag sequence, val: tag ID
		self.tag_id2seq = {} # key: tag ID, val: tag seq
		self.tag_count = {} # tag ID, val: count
		id = 0
		if self.opt.use_base_restriction == True:
			for seq in self.tag_data:
				seq = seq[:options.bases]
				if seq in self.tag_seq2id:
					self.tag_count[self.tag_seq2id[seq]] += 1
				else:
					id += 1
					self.tag_seq2id[seq] = id
					self.tag_count[id] = 1
					self.tag_id2seq[id] = seq
		else:
			for seq in self.tag_data:
				if seq in self.tag_seq2id:
					self.tag_count[self.tag_seq2id[seq]] += 1
				else:
					id += 1
					self.tag_seq2id[seq] = id
					self.tag_count[id] = 1
					self.tag_id2seq[id] = seq
		print '%d unique tags.' % id
		# remember to write a fasta file for Blast purpose
		self.write_tagFile()
	def write_tagFile(self):
		"""Write a fasta file to hold tag sequences.

		The file ``self.new_tag_file'' will be used in sequence comparison.

		Produce:
		  self.new_tag_file
		"""
		tag_file = self.opt.outFile+ '.tagFasta'
		fout = open(tag_file,'w')
		for seq,id in self.tag_seq2id.items():
			fout.write('>%d\n%s\n' % (id,seq))
		fout.close()
		self.new_tag_file = tag_file
	def tag_mapping(self):
		"""Type-specific tag-mapping method.

		Map the sequence tag to scaffold using BLAST-based methods.

		The method will be implemented/overriden in subclasses,
		which allows handling different mapping strategies
		and BLAST result parsing of different formats.

		The sequence tag will be write to a fasta file, and each
		tag will be named by its ID. The BLAST search will be
		performed using the tag file and scaffold file. Then
		the BLAST output will be parsed to generate scaffold-tag
		association.

		Produce:
		  self.tag_hit
		"""
		pass
	def DEG_output(self):
		"""Generate output files for digital gene expression analysis.

		Output files: 
		tags with single hit, tags with multiple hits, and tags with no hit at all.
		GO mapping result html file.

		If there's ``-G T'' option, the gene ontology mapping function
		will be carried out here, and perform output accordingly.
		"""
		print
		print '##  %-20s##' % 'Transcriptome quantification result'
		singlehit_number = 0
		multihit_number = 0
		fout = open(self.opt.outFile+'.singlehit','w')
		fout2 = open(self.opt.outFile+'.multihit','w')
		# this is for GO analysis function
		## to configure the function (allow mapping operations to be done
		# for single- and multiple-hit tags separately), just create
		# new variables like ``cdna_expression'' and submit it as
		# argument to self.map2go()
		cdna_expression = {} # key: cdna name, val: [] expression level (tag count)
		for tagid in self.tag_hit:
			if len(self.tag_hit[tagid]) == 1:
				fout.write('%s\t%d\t%s\t%s\n' % (
							self.tag_hit[tagid][0],
							self.tag_count[tagid],
							self.tag_id2seq[tagid],
							self.cdna_desc[self.tag_hit[tagid][0]]))
				singlehit_number += 1
			else:
				fout2.write('>%s\t%d\n' % (self.tag_id2seq[tagid],self.tag_count[tagid]))
				for hit in self.tag_hit[tagid]:
					fout2.write('\t%s\t%s\n' % (hit,self.cdna_desc[hit]))
				multihit_number += 1
			# store them in self.cdna_expression
			for hit in self.tag_hit[tagid]:
				if hit in cdna_expression:
					cdna_expression[hit].append(self.tag_count[tagid])
				else:
					cdna_expression[hit] = [self.tag_count[tagid]]
		fout.close()
		fout2.close()
		# now deals with no-hit tags
		nohit_number = 0
		fout = open(self.opt.outFile+'.nohit','w')
		for seq,tagid in self.tag_seq2id.items():
			if tagid not in self.tag_hit:
				fout.write('%s\t%d\n' % (seq,self.tag_count[tagid]))
				nohit_number += 1
		fout.close()
		print """
	 %d single-hit tags
	 %d multi-hit tags
	 %d tags has no hit.""" % (singlehit_number, multihit_number, nohit_number)
		#################
		# then check if GO categorization is needed
		if self.opt.go == False:
			return
		print
		print 'Performing Gene Ontology-mapping for cDNA entries with expression level.'
		self.parse_termFile()
		self.parse_goAnnoFile()
		self.map2go(cdna_expression)

	def parse_termFile(self):
		"""Parse GO term files (term.txt term2term.txt).

		Produce:
		  self.term_a2i
		  self.term_i2a
		  self.t2t_c2p
		  self.t2t_p2c
		"""
		print 'Parsing term files...'
		try:
			fin = open(self.opt.termFile)
		except IOError,msg:
			print msg
			sys.exit()
		a2i = {} # key: term acc, val: term ID
		i2a = {} # key: term id, val: [term acc, term name]
		self.rootTerm = {} # key: term name, val: term ID
		the_root = ('biological_process','molecular_function','cellular_component')
		for line in fin:
			lst = line.split('\t')
			a2i[lst[3]] = lst[0]
			i2a[lst[0]] = [lst[3], lst[1]]
			if lst[1] in the_root:
				self.rootTerm[lst[1]] = lst[0]
		fin.close()
		self.term_a2i = a2i
		self.term_i2a = i2a
		try:
			fin = open(self.opt.t2tFile)
		except IOError,msg:
			print msg
			sys.exit()
		c2p = {} # child2parent key: child ID, val: {} parent ID
		p2c = {} # parent2child key: parent ID, val: {} child ID
		for line in fin:
			lst = line.split('\t')
			if lst[3] in c2p:
				c2p[lst[3]][lst[2]] = 1
			else:
				c2p[lst[3]] = {lst[2] : 1}
			if lst[2] in p2c:
				p2c[lst[2]][lst[3]] = 1
			else:
				p2c[lst[2]] = {lst[3] : 1}
		fin.close()
		self.t2t_c2p = c2p
		self.t2t_p2c = p2c
	def parse_goAnnoFile(self):
		"""Parse GO annotation file for cDNA.

		The format of GO anno file is declared in usage region. An example:
		At5g45340 -tab- GO:0007896 -newline-
		At5g45340 -tab- GO:0007897 -newline-
		At5g45340 -tab- GO:0007898 -newline-
		...

		Produce:
		  self.cdna_goAnno
		"""
		print 'Parsing cDNA Gene Ontology annotation file...'
		try:
			fin = open(self.opt.goAnnoFile)
		except IOError,msg:
			print msg
			sys.exit()
		cdna_goAnno = {} # key: cdna name, val: {} term ID
		for line in fin:
			lst = line.strip().split('\t')
			if lst[1] in self.term_a2i:
				if lst[0] in cdna_goAnno:
					cdna_goAnno[lst[0]][self.term_a2i[lst[1]]] = 1
				else:
					cdna_goAnno[lst[0]] = { self.term_a2i[lst[1]] : 1 }
			else:
				print 'Unrecognized term account: ' + lst[1]
		fin.close()
		self.cdna_goAnno = cdna_goAnno

	def map2go(self,cdna_expression):
		"""Map cDNA entries to the GO hierarchy (up the tree), and make output.

		Attributes used here:
		  self.cdna_goAnno {cdna : { termID : 1 }}
		Argument:
		  cdna_expression {cdna : [ expression level ] }
		Produce:
		  self.map_content {term ID : { itemName : 1 } }
		"""
		print 'Perform mapping of cDNAs to Gene Ontology hierarchy...'
		self.map_content = {} # key: term id, val: {} item name
		for item in cdna_expression:
			if item not in self.cdna_goAnno:
				continue
			for tid in self.cdna_goAnno[item]:
				self.find_parent_recursive(tid,item)
		self.map2go_output(cdna_expression)
	def find_parent_recursive(self,cid,item):
		"""Map one child term up to the GO tree.

		Arguments:
		  cid: child term id
		  item: item name been annotated with cid
		Attributes used here:
		  self.t2t_c2p
		  self.rootTerm
		  self.map_content
		  self.map_content: {} key: term id, val: {} item names mapped to this term
		     mapping result will be recorded in this data structure
		"""
		if cid in self.map_content:
			self.map_content[cid][item] = 1
		else:
			self.map_content[cid] = {item : 1}
		if cid in self.t2t_c2p:
			# this term got parents
			for pid in self.t2t_c2p[cid]:
				self.find_parent_recursive(pid,item)
		return
	def map2go_output(self,cdna_expression):
		"""Generate html output for GO mapping result.

		Attributes used:
		  self.term
		  self.rootTerm
		  self.map_content
		  self.cdna_desc
		Argument:
		  cdna_expression
		"""
		print 'Generating mapping output in html format...'
		fout = open(self.opt.outFile + '.GOmap.html','w')
		fout.write("""
<html>
  <head>
    <title>CrossAnn: Gene Ontology mapping result</title>
  </head>
  <body>

  <script language="JavaScript">
  <!--
    function showhide(id){
		if(document.getElementById){
			if((document.getElementById(id).style.display == "block")){
				document.getElementById(id).style.display = 'none';
			}else{
				document.getElementById(id).style.display = 'block';
			}
		}else{
			if(document.layers){
				document.id.display = 
					(document.id.display == "block") ? 'none' : 'block';
			}else{
				document.all.id.style.display =
					(document.all.id.style.display == "block") ? 'none' : 'block';
			}
		}
	}
	-->
  </script>

  <br>
  <h3>CrossAnn: Gene Ontology mapping result</h3>
  <p>Click on the GO account number to browse through the result.</p>

""")
		from random import randint
		fout.write('<ul>\n')
		self.prstr = ''
		for rootName in ('biological_process','molecular_function','cellular_component'):
			root_id = self.rootTerm[rootName]
			if root_id in self.map_content:
				self.write_tree_recursive(root_id,cdna_expression,randint,fout)
		fout.write("""
    </ul>
  </body>
</html>
""")
		fout.close()
	def write_tree_recursive(self,pid,cdna_expression,randint,fout):
		"""Write sub-tree for a GO term to a file, down the tree.

		Attributes used:
		  self.map_content
		  self.t2t_p2c
		  self.cdna_desc
		  self.term_i2a
		Argument:
		  pid: parent term id
		  cdna_expression
		  randint: the method class
		  fout: a file object for output
		"""
		# judge if current term needs link
		if pid not in self.map_content:
			return
		randnum = randint(0,10)
		term_id = pid + '.' + str(randint(0,10))
		table_id = term_id + 't'
		print_link = False
		if pid in self.t2t_p2c:
			# this term gets child
			for cid in self.t2t_p2c[pid]:
				if cid in self.map_content:
					print_link = True
					break
			if print_link == True:
				# this term has fruitful child
				fout.write('<li><a href="javascript:showhide(\'%s\');">%s</a> %s ' % (term_id,self.term_i2a[pid][0],self.term_i2a[pid][1]))
			else:
				# this term does not have fruitful child
				fout.write('<li>%s %s ' % (self.term_i2a[pid][0],self.term_i2a[pid][1]))
		else:
			# this term does not have child at all
			fout.write('<li>%s %s ' % (self.term_i2a[pid][0],self.term_i2a[pid][1]))
		# table of cdna items
		fout.write('(cDNA: %d)' % len(self.map_content[pid]))
		#fout.write('<a href="javascript:showhide(\'%s\');">(cDNA entry details)</a>' % table_id)
		fout.write('</li>')
		#fout.write('<table id="%s" style="display: none;" cellspacing=1 bgcolor=black border=0>' % table_id)
		#for item in self.map_content[pid]:
		#	fout.write("""<tr><td bgcolor=white>%s</td>
		#		<td bgcolor=white>%s</td>
		#		<td bgcolor=white>%s</td></tr>""" % (
		#			item,
		#			self.cdna_desc[item],
		#			', '.join([str(kk) for kk in cdna_expression[item]]) ))
		#fout.write('</table>')
		# child terms
		if print_link == True:
			fout.write('<ul type=none id="%s" style="display: none;">' % term_id)
			for cid in self.t2t_p2c[pid]:
				self.write_tree_recursive(cid,cdna_expression,randint,fout)
			fout.write('</ul>')
		return



class seqTag_solexa(seqTag):
	"""For solexa data.
	
	"""
	def read_tagFile(self):
		"""Reads in sequence tag file (in FASTQ format)."""
		print 'Loading Solexa tag data...'
		try:
			fin = open(self.opt.tagFile)
		except IOError, msg:
			print msg
			sys.exit()
		# ** fastq File format **
		#  @I330_2_FC30HGNAAXX:3:1:0:355/1
		#  ACTTGCGTTCAAAGACTCGATGGTTCACGGGATTCTGCN
		#  +I330_2_FC30HGNAAXX:3:1:0:355/1
		#  \dRPbhUde`WHTZRcI]RSMPMGSQNUNIHG=NQKAJ;
		while True:
			line = fin.readline() # drop first line
			if not line:
				break
			self.tag_data.append(fin.readline().rstrip()) # read seq
			fin.readline() # drop third line
			fin.readline() # drop the scoring line
		fin.close()
	def tag_mapping(self):
		"""Perform megablast."""
		self.blastResult_file = self.opt.outFile+'.mgblast'
		print
		print '##  %-20s##' % 'Tag mapping'
		commandstr = 'megablast -i %s -d %s -F F -D 3 -m 8 -o %s' % (
				self.new_tag_file,
				self.opt.libFile,
				self.blastResult_file)
		print 'RUNNING: ' + commandstr
		status = os.system(commandstr)
		if status != 0:
			print 'MEGABLAST search error: please check if you have correctly formated the cDNA sequence using formatdb.'
			sys.exit()
		tag_hit = {} # key: tag ID, val: a list of hit names

		## parse result
		# output file will have following columns:
		# Query id, Subject id, % identity, alignment length, mismatches, gap openings, 
		# q. start, q. end,     s. start,   s. end,           e-value,    bit score
		try:
			fin = open(self.blastResult_file)
		except IOError, msg:
			print msg
			sys.exit()
		for line in fin:
			if line.startswith('#'):
				continue
			lst = line.rstrip().split('\t')
			tagid = int(lst[0])
			hit_name = lst[1].replace('lcl|','') # in megablast v2.2.19, 'lcl|' will be appended in front of the hit name
			# mismatch check
			if int(lst[4]) > self.opt.mismatch:
				continue
			# gap openings check
			if int(lst[5]) > self.opt.gap:
				continue
			if tagid in tag_hit:
				tag_hit[tagid].append(hit_name)
			else:
				tag_hit[tagid] = [hit_name]
		fin.close()
		self.tag_hit = tag_hit


class seqTag_iGentifier(seqTag):
	"""For iGentifier data."""
	def read_tagFile(self):
		"""Parse iGentifier sequence file, notice format below.

		Specially, this will do nucleotide code check. If ambiguity
		nucleotide code are found, the tag-mapping is preferably 
		done using WU-BLAST.
		But due to unavailability of WU-BLAST software, we resort to
		regular expression pattern match method, which is very slow.
		(NCBI-BLAST does not handle this)
		"""
		print 'Loading iGentifier tag data...'
		try:
			fin = open(self.opt.tagFile)
		except IOError, msg:
			print msg
			sys.exit()
		# IUPAC codes, as from http://www.bioinformatics.org/sms/iupac.html
		self.ambiguity_code = ['R','Y','S','W','K','M','B','D','H','V','N']
		# ** File format **
		# must be tab-deliminated, first column is tag sequence,
		# second column is count
		self.tag_has_ambiguity = False
		ambiguity_count = 0
		for line in fin:
			lst = line.rstrip().split('\t')
			# in my version of iGentifier data, they use hyphen to 
			# represent "any base", but in IUPAC, it should be "N"
			seq = lst[0].replace('-','N') 
			for letter in seq:
				if letter in self.ambiguity_code:
					self.tag_has_ambiguity = True
					ambiguity_count += 1
			# following is to make iGentifier tag data compatible with the file read method
			# from other classes, and to produce same output (self.tag_data)
			# repeat the tag sequence for the time indicated
			# in second column in the iGentifier data file
			#self.tag_data.extend([seq for i in range(int(lst[1]))])
			self.tag_data.extend([seq for i in range(int(lst[1])+1)])
		fin.close()
		if self.tag_has_ambiguity == True:
			print '%d ambiguity nucleotide letters were found in iGentifier sequence tag data.' % ambiguity_count
	def tag_mapping(self):
		"""Perform tag mapping.
		
		If tag sequence doesn't contain ambiguity, the mapping will be done
		using seqTag_solexa.tag_mapping() method.
		Else, it will be done using regular expression pattern match
		(rather reluctant choice).
		"""
		print
		print '##  %-20s##' % 'Tag mapping'
		if self.tag_has_ambiguity == True:
			# read cdna library file into memory
			try:
				fin = open(self.opt.libFile)
			except IOError,msg:
				print msg
				sys.exit()
			cdna_seq = {}
			this_name = ''
			this_seq = ''
			for line in fin:
				if line.startswith('>'):
					if this_seq:
						cdna_seq[this_name] = this_seq
					this_name = line.rstrip().split()[0].replace('>','')
					this_seq = ''
				else:
					this_seq += line.strip().replace(' ','')
			cdna_seq[this_name] = this_seq
			fin.close()
			tag_hit = {} # key: tag id, val: a list hit names
			for tag_id in self.tag_id2seq:
				print tag_id
				for cdna_name in cdna_seq:
					result = self.direct_match(self.tag_id2seq[tag_id],cdna_seq[cdna_name])
					if result == True:
						#print 'hit on ' + cdna_name
						if tag_id in tag_hit:
							tag_hit[tag_id].append(cdna_name)
						else:
							tag_hit[tag_id] = [ cdna_name ]
			self.tag_hit = tag_hit
		else:
			seqTag_solexa.tag_mapping(self)
	def direct_match(self,tag_seq,cdna_seq):
		"""Match tag seqeunce to cdna sequence.

		Tag seq have ambiguity code, that cannot be handled by ncbi-blast.

		Attributes:
		  self.ambiguity_code
		  self.opt.mismatch
		"""
		ambiguity_check = {
			'R':'AG',
			'Y':'CT',
			'S':'GC',
			'W':'AT',
			'K':'GT',
			'M':'AC',
			'B':'CGT',
			'D':'AGT',
			'H':'ACT',
			'V':'ACG',
			'N':'AGCT'
			}
		for i in range(len(cdna_seq)-len(tag_seq)+1):
			seg = cdna_seq[i:i+len(tag_seq)]
			mismatch = 0
			unmatch = False
			for j in range(len(seg)):
				if tag_seq[j] in self.ambiguity_code:
					if seg[j] not in ambiguity_check[tag_seq[j]]:
						mismatch += 1
						if mismatch > self.opt.mismatch:
							unmatch = True
							break
				else:
					if seg[j] != tag_seq[j]:
						mismatch += 1
						if mismatch > self.opt.mismatch:
							unmatch = True
							break
			if unmatch == False:
				return True



if __name__ == '__main__':
	#tag_instance = seqTag_solexa()
	tag_instance = seqTag_iGentifier()
	tag_instance.tag_mapping()
	tag_instance.DEG_output()

