#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
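# (For reference: R marks raw string literals, u8/u/U mark UTF-8/16/32
# encodings, L marks wide strings, and the *R forms are their raw variants.)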


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'
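# For example (illustrative): 'while' and 'foo' tokenize as NAME, '{' and '::'
# as SYNTAX, '0x1F' and '"str"' as CONSTANT, and a whole '#include <map>'
# directive becomes a single PREPROCESSOR token.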

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index just past the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
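
# Worked example (illustrative): scanning the C++ literal "a\"b", the first
# quote found after the opening one is preceded by a single backslash (odd
# count), so it is escaped and the search continues to the real closing quote;
# in "a\\", the closing quote is preceded by two backslashes (even count),
# which escape each other, so that quote terminates the string.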


def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
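
# Worked example (illustrative): for the char literal '\\', the closing quote
# is preceded by a backslash, but the char before that is also a backslash,
# so the special case above accepts it as the terminator; for '\'', the first
# candidate quote is escaped, so the scan moves on to the next one.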


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The raise below is skipped while inside a #if 0 block so that
            # bogus code there can be tolerated.  Since that code is ignored
            # anyway, this is probably fine: skip the exception and return
            # the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
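
# A quick illustration (not exhaustive): for the C++ text 'int x = 42;',
# GetTokens() yields Tokens whose (token_type, name) pairs are
# (NAME, 'int'), (NAME, 'x'), (SYNTAX, '='), (CONSTANT, '42'), (SYNTAX, ';');
# for every Token t, source[t.start:t.end] == t.name.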


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)