lutin/cppParser/Parse.py

#!/usr/bin/python
import os
import sys
import re

import ply.lex as lex

import inspect
import lutinDebug as debug
import lutinTools
import Class

# Token type names declared for the PLY lexer built by lex.lex() below.
# NOTE(review): 'NEW_LINE' has no matching t_NEW_LINE rule in this file; newlines
# are consumed by t_NEWLINE, which returns nothing, so this type is never emitted
# by the rules visible here.
tokens = [
	'NUMBER',
	'NAME',
	'OPEN_PAREN',
	'CLOSE_PAREN',
	'OPEN_BRACE',
	'CLOSE_BRACE',
	'OPEN_SQUARE_BRACKET',
	'CLOSE_SQUARE_BRACKET',
	'COLON',
	'SEMI_COLON',
	'COMMA',
	'TAB',
	'BACKSLASH',
	'PIPE',
	'PERCENT',
	'EXCLAMATION',
	'CARET',
	'COMMENT_SINGLELINE',
	'COMMENT_MULTILINE',
	'PRECOMP_MACRO',
	'PRECOMP_MACRO_CONT',
	'ASTERISK',
	'AMPERSTAND',
	'EQUALS',
	'MINUS',
	'PLUS',
	'DIVIDE',
	'CHAR_LITERAL',
	'STRING_LITERAL',
	'NEW_LINE',
	'SQUOTE',
]

# Characters skipped between tokens: space, CR, '.', '?', '@' and form feed.
t_ignore = " \r.?@\f"
# A digit followed by any mix of digits, 'x'/'X' and hex letters (decimal or hex-looking literal).
t_NUMBER = r'[0-9][0-9XxA-Fa-f]*'
# Identifier; '<', '>' and '~' are also accepted as the first character.
t_NAME = r'[<>A-Za-z_~][A-Za-z0-9_]*'
# Single-character punctuation tokens.
t_OPEN_PAREN = r'\('
t_CLOSE_PAREN = r'\)'
t_OPEN_BRACE = r'{'
t_CLOSE_BRACE = r'}'
t_OPEN_SQUARE_BRACKET = r'\['
t_CLOSE_SQUARE_BRACKET = r'\]'
t_SEMI_COLON = r';'
t_COLON = r':'
t_COMMA = r','
t_TAB = r'\t'
t_BACKSLASH = r'\\'
t_PIPE = r'\|'
t_PERCENT = r'%'
t_CARET = r'\^'
t_EXCLAMATION = r'!'
# Preprocessor directive: '#' and everything up to the end of the line.
t_PRECOMP_MACRO = r'\#.*'
# Any line whose last character before the newline is a backslash.
t_PRECOMP_MACRO_CONT = r'.*\\\n'
def t_COMMENT_SINGLELINE(t):
	r'\/\/.*\n?'
	# The raw string above is the token regex read by PLY, not documentation.
	# The trailing newline is optional so that a '//' comment on the very last
	# line of a file that does not end with a newline is still recognised.
	#
	# Doxygen comments ('///' or '//!') are appended, without their trailing
	# newline, to the module-level doxygenCommentCache; successive comments are
	# separated by a newline. Nothing is returned, so the comment is dropped
	# from the token stream.
	global doxygenCommentCache
	if t.value.startswith(("///", "//!")):
		if doxygenCommentCache:
			doxygenCommentCache += "\n"
		if t.value.endswith("\n"):
			doxygenCommentCache += t.value[:-1]
		else:
			doxygenCommentCache += t.value
	# str.count works on both Python 2 and 3 (len(filter(...)) fails on 3).
	t.lexer.lineno += t.value.count("\n")
# Single-character operator tokens.
t_ASTERISK = r'\*'
t_MINUS = r'\-'
t_PLUS = r'\+'
# A '/' that is not immediately followed by another '/' (negative lookahead).
t_DIVIDE = r'/(?!/)'
t_AMPERSTAND = r'&'
t_EQUALS = r'='
# Exactly one character between single quotes; escape sequences such as '\n' are not covered.
t_CHAR_LITERAL = "'.'"
# A lone single quote.
t_SQUOTE = "'"
#found at http://wordaligned.org/articles/string-literals-and-regular-expressions
#TODO: This does not work with the string "bla \" bla"
# Double-quoted string: a run of characters that are neither '"' nor '\', or backslash-escaped characters.
t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
#Found at http://ostermiller.org/findcomment.html
def t_COMMENT_MULTILINE(t):
	r'/\*([^*]|\n|(\*+([^*/]|\n)))*\*+/'
	# The raw string above is the token regex read by PLY, not documentation.
	#
	# Doxygen blocks ('/**' or '/*!') are appended to the module-level
	# doxygenCommentCache after collapsing doubled newlines and removing the
	# whitespace in front of a leading '*' on each line. Nothing is returned,
	# so the comment is dropped from the token stream.
	global doxygenCommentCache
	if t.value.startswith(("/**", "/*!")):
		#not sure why, but get double new lines
		v = t.value.replace("\n\n", "\n")
		#strip prefixing whitespace (raw pattern: '\s' is not a valid string escape)
		v = re.sub(r"\n[\s]+\*", "\n*", v)
		doxygenCommentCache += v
	# str.count works on both Python 2 and 3 (len(filter(...)) fails on 3).
	t.lexer.lineno += t.value.count("\n")
def t_NEWLINE(t):
	r'\n+'
	# The raw string above is the token regex read by PLY.
	# Advance the lexer's line counter by the length of the matched run and
	# return nothing, so no token is emitted for it.
	matched = t.value
	t.lexer.lineno = t.lexer.lineno + len(matched)

def t_error(v):
	# Lexer error hook: called with a token whose value is the not-yet-lexed
	# remainder of the input, starting at the offending character.
	# Only the offending character is reported (not the whole remaining input),
	# and it is skipped so that lexing continues with the next character
	# instead of PLY aborting because the position did not advance.
	print("Lex error: illegal character %r at line %s" % (v.value[:1], v.lineno))
	v.lexer.skip(1)

lex.lex()


class TagStr(str):
	"""Wrapper for a string that allows us to store the line number associated with it"""
	# Maps id(instance) -> line number for every live TagStr created with a
	# 'lineno' keyword. Entries are removed when the instance is destroyed.
	lineno_reg = {}

	def __new__(cls, *args, **kw):
		"""Build the string from *args; an optional lineno=<int> keyword tags it with a line number."""
		new_obj = str.__new__(cls, *args)
		if "lineno" in kw:
			TagStr.lineno_reg[id(new_obj)] = kw["lineno"]
		return new_obj

	def __del__(self):
		"""Drop this instance's registry entry so a recycled id() cannot inherit a stale line number."""
		try:
			# pop() with a default: untagged instances simply have no entry.
			TagStr.lineno_reg.pop(id(self), None)
		except (AttributeError, TypeError):
			# At interpreter shutdown module globals may already be torn down;
			# there is nothing left to clean up in that case.
			pass

	def lineno(self):
		"""Return the line number given at construction, or -1 when none was given."""
		return TagStr.lineno_reg.get(id(self), -1)


# Accumulated text of the doxygen-style comments seen by the lexer; the
# t_COMMENT_SINGLELINE and t_COMMENT_MULTILINE rules append to it.
doxygenCommentCache = ""

#Track what was added in what order and at what depth
# NOTE(review): nothing in the visible part of this file reads or writes this list.
parseHistory = []

def is_namespace(nameStack):
	"""Tell whether the name stack starts with the 'namespace' keyword (False for an empty stack)."""
	return len(nameStack) != 0 and nameStack[0] == "namespace"

def is_enum_namestack(nameStack):
	"""Tell whether the name stack starts with 'enum' or with 'typedef' followed by 'enum'."""
	size = len(nameStack)
	if size == 0:
		return False
	leading = nameStack[0]
	if leading == "enum":
		return True
	# 'typedef enum ...' needs at least two entries
	return size > 1 and leading == "typedef" and nameStack[1] == "enum"

def is_fundamental(s):
	"""Tell whether every whitespace-separated word of s is a fundamental C type keyword or '*'.

	An empty (or all-whitespace) string has no words and therefore yields True.
	"""
	fundamental_words = (
		"size_t", "struct", "union", "unsigned", "signed", "bool", "char",
		"short", "int", "float", "double", "long", "void", "*",
	)
	return all(word in fundamental_words for word in s.split())

def is_function_pointer_stack(stack):
	"""Tell whether the stack looks like a function-pointer declaration.

	True when the stack holds exactly two top-level (non-nested) parenthesis
	groups and a '*' comes directly after the '(' that opens the first one,
	as in: void ( * name ) ( args ).
	"""
	depth = 0
	closed_groups = 0
	star_seen = False
	previous = None
	for item in stack:
		if item == "(":
			depth += 1
		elif item == ")":
			# an unmatched ')' at depth 0 is ignored
			if depth > 0:
				depth -= 1
				if depth == 0:
					closed_groups += 1
		elif item == "*" and previous == "(" and closed_groups == 0 and depth == 1:
			star_seen = True
		previous = item
	return star_seen and closed_groups == 2

def is_method_namestack(stack):
	"""Tell whether the stack looks like a method/function declaration or definition.

	Rules, in order:
	  - no '(' at all, or a leading 'typedef': not a method;
	  - any 'operator' keyword: always a method;
	  - a '{' before the first '(': a struct-like body, not a method;
	  - with both '(' and ')': a method when it has a '{' (with or without the
	    closing '}'), or when it ends with ';' and is not a function pointer;
	  - finally, an '=' placed before the first '(' marks an initialised
	    property such as "static const int CONST_A = (1 << 7) - 1;".
	"""
	if '(' not in stack or stack[0] == 'typedef':
		# TODO deal with typedef function prototypes
		return False
	if 'operator' in stack:
		# allow all operators
		return True
	first_paren = stack.index('(')
	has_open_brace = '{' in stack
	if has_open_brace and stack.index('{') < first_paren:
		# struct that looks like a method/class
		return False
	if ')' not in stack:
		return False
	if has_open_brace and '}' in stack:
		verdict = True
	elif stack[-1] == ';':
		verdict = not is_function_pointer_stack(stack)
	else:
		# ideally we catch both braces... TODO
		verdict = has_open_brace
	if verdict and "=" in stack and stack.index("=") < first_paren:
		return False
	return verdict

def is_property_namestack(nameStack):
	"""Tell whether the name stack describes a property (variable) rather than a method.

	A stack is a property when it has no parenthesis at all, when an '=' comes
	before its first '(' (initialiser expression), or when it is a function
	pointer according to is_function_pointer_stack().
	"""
	if '(' not in nameStack and ')' not in nameStack:
		return True
	if "(" in nameStack and "=" in nameStack:
		if nameStack.index("=") < nameStack.index("("):
			return True
	#See if we are a function pointer
	if is_function_pointer_stack(nameStack):
		return True
	return False

def detect_lineno(s):
	"""Detect the line number for a given token string.

	Uses s.lineno() when s provides it and it returns something other than -1
	(the "unknown" marker); otherwise falls back to the module-level curLine,
	which the parser updates with the line of the token currently processed.
	"""
	global curLine
	try:
		rtn = s.lineno()
	except (AttributeError, TypeError):
		# no lineno attribute (e.g. a plain str) or lineno is not callable
		rtn = -1
	if rtn != -1:
		return rtn
	return curLine

def filter_out_attribute_keyword(stack):
	"""Strips __attribute__ and its parenthetical expression from the stack.

	Only the first '__attribute__' is handled. The input is returned as is
	when it holds no '__attribute__'; otherwise a new list is returned. When
	the keyword is not followed by '(' (or is the last element), or when the
	parenthesis is never closed, only the keyword itself is removed.
	"""
	if "__attribute__" not in stack:
		return stack
	debug.debug("Stripping __attribute__ from %s" % (stack,))
	attr_index = stack.index("__attribute__")
	# default: only the keyword is dropped
	attr_end = attr_index + 1
	# bounds check: the keyword may be the last element of the stack
	if attr_end < len(stack) and stack[attr_end] == '(':
		#Find final paren
		paren_count = 1
		# range (not xrange) so the loop also runs on Python 3
		for i in range(attr_index + 2, len(stack)):
			elm = stack[i]
			if elm == '(':
				paren_count += 1
			elif elm == ')':
				paren_count -= 1
				if paren_count == 0:
					attr_end = i + 1
					break
	new_stack = stack[:attr_index] + stack[attr_end:]
	debug.debug("stripped stack is %s" % (new_stack,))
	return new_stack


# Access specifier keywords. parse_file.__init__ rewrites any entry that
# contains a space so that runs of spaces are collapsed and the ends trimmed.
supportedAccessSpecifier = [
	'public',
	'protected',
	'private'
]


##
## @brief Join the scope-resolution tokens of a name : ['class', 'Bar', ':', ':', 'Foo'] -> ['class', 'Bar::Foo']
## @param table Input token list. ex: ['class', 'Bar', ':', ':', 'Foo']
## @return The input list itself when it contains no '::'; otherwise a new list where each
##         'xx', ':', ':', 'yy' run is merged into 'xx::yy'. A leading '::' (global scope)
##         stays glued to the name that follows it: [':', ':', 'Foo'] -> ['::Foo'].
##         No other token is removed or altered.
##
def create_compleate_class_name(table):
	if "::" not in "".join(table):
		return table
	# we need to convert it :
	out = []
	for name in table:
		if len(out) == 0:
			out.append(name)
		elif name == ":" and out[-1].endswith(":"):
			# second ':' of a '::' pair
			out[-1] += name
		elif out[-1].endswith("::"):
			if len(out) >= 2:
				# fold 'xx', '::', 'yy' into a single 'xx::yy' entry
				out[-2] += out[-1] + name
				del out[-1]
			else:
				# '::' opens the list: there is no left-hand name to attach to
				out[-1] += name
		else:
			out.append(name)
	return out

class parse_file():
	"""
	Read a C/C++ header, normalise its text, tokenise it with the module lexer
	and walk the tokens, reporting through lutinDebug every scope opened by a
	'{' (namespace, class, enum, struct, typedef, union), every access
	specifier change and every ';'-terminated statement.
	"""

	def __init__(self, fileName):
		"""
		Parse the header as soon as the object is built.

		@param fileName Path of the header, read with lutinTools.FileReadData.
		"""
		global curLine
		global curChar
		self.m_classes = []
		# Entries are popped on '}' when their 'level' equals the brace depth.
		# NOTE(review): nothing in the visible code pushes onto this stack.
		self.m_elementParseStack = []
		debug.info("Parse File tod document : '" + fileName + "'")

		self.headerFileName = fileName

		self.anon_union_counter = [-1, 0]
		# load all the file data :
		headerFileStr = lutinTools.FileReadData(fileName)

		# Make sure supportedAccessSpecifier are sane
		for i, specifier in enumerate(supportedAccessSpecifier):
			if " " not in specifier:
				continue
			supportedAccessSpecifier[i] = re.sub("[ ]+", " ", specifier).strip()

		# Strip out template declarations
		# TODO : What is the real need ???
		headerFileStr = re.sub("template[\t ]*<[^>]*>", "", headerFileStr)
		# Normalise line endings to '\n' (handles '\r\n' and lone '\r') ==> this simplify next regExp ...
		headerFileStr = re.sub("\r", "\r\n", headerFileStr)
		headerFileStr = re.sub("\r\n\n", "\r\n", headerFileStr)
		headerFileStr = re.sub("\r", "", headerFileStr)
		# Drop '#if 0' sections up to the first '#endif' or '#else'
		# TODO : Can generate some error ...
		# (raw pattern: '\#' is not a valid escape in a normal string literal)
		headerFileStr = re.sub(r"\#if 0(.*?)(\#endif|\#else)", "", headerFileStr, flags=re.DOTALL)

		debug.debug(headerFileStr)

		# Change multi line #defines and expressions to single lines maintaining line numbers
		matches = re.findall(r'(?m)^(?:.*\\\n)+.*$', headerFileStr)
		is_define = re.compile(r'[ \t\v]*#[Dd][Ee][Ff][Ii][Nn][Ee]')
		for m in matches:
			#Keep the newlines so that linecount doesnt break
			# (str.count works on Python 2 and 3; len(filter(...)) fails on 3)
			num_newlines = m.count("\n")
			if is_define.match(m):
				new_m = m.replace("\n", "<**multiLine**>\\n")
			else:
				# Just expression taking up multiple lines, make it take 1 line for easier parsing
				new_m = m.replace("\\\n", " ")
			if num_newlines > 0:
				new_m += "\n" * num_newlines
			headerFileStr = headerFileStr.replace(m, new_m)

		#Filter out Extern "C" statements. These are order dependent
		headerFileStr = re.sub(r'extern( |\t)+"[Cc]"( |\t)*{', "{", headerFileStr)

		###### debug.info(headerFileStr)
		self.stack = [] # token stack to find the namespace and the element name ...
		self.nameStack = [] # tokens of the statement currently being read
		self.braceDepth = 0
		# Token types whose text is pushed unchanged onto the name stack
		appendedTypes = (
			'OPEN_PAREN', 'CLOSE_PAREN',
			'OPEN_SQUARE_BRACKET', 'CLOSE_SQUARE_BRACKET',
			'EQUALS', 'COMMA', 'BACKSLASH', 'PIPE', 'PERCENT', 'CARET',
			'EXCLAMATION', 'NUMBER', 'MINUS', 'PLUS', 'STRING_LITERAL',
			'NAME', 'AMPERSTAND', 'ASTERISK', 'CHAR_LITERAL',
		)
		# Build a lexer for this file and feed it the normalised text
		lex.lex()
		lex.input(headerFileStr)
		curLine = 0
		curChar = 0
		while True:
			tok = lex.token()
			if not tok:
				break
			# keep the line number attached to the token text
			tok.value = TagStr(tok.value, lineno=tok.lineno)
			debug.debug("TOK: " + str(tok))
			self.stack.append( tok.value )
			curLine = tok.lineno
			curChar = tok.lexpos
			if (tok.type in ('PRECOMP_MACRO', 'PRECOMP_MACRO_CONT')):
				debug.debug("PRECOMP: " + str(tok))
				self.stack = []
				self.nameStack = []
				# Do nothing for macro ==> many time not needed ...
				continue
			if (tok.type == 'OPEN_BRACE'):
				# When we open a brace, this is the time to parse the stack ...
				# Concatenate the 'xx', ':', ':', 'yy' in 'xx::yy'
				self.nameStack = create_compleate_class_name(self.nameStack)
				if len(self.nameStack) <= 0:
					#open brace with no name ...
					debug.warning("[" + str(self.braceDepth) + "] find an empty stack ...")
				elif 'namespace' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a namespace : " + str(self.nameStack))
				elif 'class' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a class     : " + str(self.nameStack))
				elif 'enum' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a enum      : " + str(self.nameStack))
				elif 'struct' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a struct    : " + str(self.nameStack))
				elif 'typedef' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a typedef   : " + str(self.nameStack))
				elif 'union' in self.nameStack:
					debug.info("[" + str(self.braceDepth) + "] find a union     : " + str(self.nameStack))
				else:
					debug.warning("[" + str(self.braceDepth) + "] find an unknow stack : " + str(self.nameStack))
				self.stack = []
				self.nameStack = []
				self.braceDepth += 1
			elif tok.type == 'CLOSE_BRACE':
				self.braceDepth -= 1
				debug.info("[" + str(self.braceDepth) + "] close brace")
				if     len(self.m_elementParseStack) != 0 \
				   and self.m_elementParseStack[-1]['level'] == self.braceDepth :
					self.m_elementParseStack.pop()
			if tok.type in appendedTypes:
				self.nameStack.append(tok.value)
			elif tok.type in ('TAB', 'SQUOTE'):
				pass
			elif tok.type == 'COLON':
				# The emptiness check avoids an IndexError when a ':' starts a
				# statement, e.g. a global-scope '::name'.
				if     len(self.nameStack) != 0 \
				   and self.nameStack[0] in ['private', 'protected', 'public']:
					debug.info("[" + str(self.braceDepth) + "]     change visibility : " + self.nameStack[0])
					self.nameStack = []
					self.stack = []
				else :
					self.nameStack.append(tok.value)
			elif tok.type == 'SEMI_COLON':
				if len(self.nameStack) != 0:
					debug.info("[" + str(self.braceDepth) + "]     semicolumn : " + str(self.nameStack))
				self.stack = []
				self.nameStack = []