535 lines
20 KiB
Python
535 lines
20 KiB
Python
'''
Contains the classes PARSEOBJECT and PARSER.
'''
|
|
from itertools import count
|
|
import os
|
|
from tokenizer import TOKENIZER
|
|
from lineanalyzer import ANALYZER
|
|
|
|
# Element type definitions. Used in the parse process.
ELEMENT_TYPE_PREPROCESS, ELEMENT_TYPE_REGULAR = 1, 2

# Token tags that parse_comment() and parse_preprocess() drop from a line.
TOKENS = [
    # Comment markers.
    'TOKEN_CSTART',
    'TOKEN_CMID',
    'TOKEN_CEND',
    # Parentheses, line/return markers, typedef and #if.
    'TOKEN_RPAREN',
    'TOKEN_LPAREN',
    'TOKEN_ENDLINE',
    'TOKEN_RETVAL',
    'TOKEN_TYPEDEF',
    'TOKEN_IF',
    # Operators.
    'TOKEN_PLUS',
    'TOKEN_MINUS',
    'TOKEN_DIV',
    'TOKEN_MULT',
    'TOKEN_ASSIGN',
    'TOKEN_EQUAL',
    # Punctuation.
    'TOKEN_LBRACE',
    'TOKEN_RBRACE',
    'TOKEN_COMMA',
    'TOKEN_SEMICOLON',
    'TOKEN_LANGLE',
    'TOKEN_RANGLE',
    # Declarations.
    'TOKEN_POINTER',
    'TOKEN_STRUCT',
    'TOKEN_ENUM',
    'TOKEN_MACRO',
    'TOKEN_FUNCTION',
    'TOKEN_TYPEDEF_ENUM',
    'TOKEN_TYPEDEF_STRUCT',
    'TOKEN_TYPEDEF_STRUCT_STRUCT',
    'TOKEN_TAG_NAME',
    'TOKEN_ALIAS',
    # Listed a second time, exactly as in the list this replaces.
    'TOKEN_ENUM',
]
|
|
|
|
# Reserved C keywords, each mapped to its upper-case spelling.
RESERVED = {
    keyword: keyword.upper()
    for keyword in (
        'auto', 'break', 'case', 'char',
        'const', 'continue', 'default', 'do',
        'int', 'long', 'register', 'return',
        'short', 'signed', 'sizeof', 'static',
        'struct', 'switch', 'typedef', 'union',
        'unsigned', 'void', 'volatile', 'while',
        'double', 'else', 'enum', 'extern',
        'float', 'for', 'goto', 'if',
    )
}
|
|
|
|
# Preprocessor words and predefined macro names mapped to their token tags.
PREPROCESSOR_DIRECTIVES = {
    # Directives.
    '#include': 'TOKEN_INCLUDE',
    '#define': 'TOKEN_DEFINE',
    '#undef': 'TOKEN_UNDEFINE',
    '#if': 'TOKEN_IF',
    '#ifdef': 'TOKEN_IFDEF',
    '#ifndef': 'TOKEN_IFNDEF',
    '#error': 'TOKEN_ERROR',
    # Predefined macros.
    '__FILE__': 'TOKEN_BASE_FILE',
    '__LINE__': 'TOKEN_BASE_LINE',
    '__DATE__': 'TOKEN_BASE_DATE',
    '__TIME__': 'TOKEN_BASE_TIME',
    '__TIMESTAMP__': 'TOKEN_BASE_TIMESTAMP',
    # Remaining entries.
    'pragma': 'TOKEN_PRAGMA',
    '#': 'TOKEN_HASH',
    '##': 'TOKEN_DOUBLEHASH',
    '#endif': 'TOKEN_ENDIF',
}
|
|
|
|
# Exact-match lookup from a source word to its token tag.
REGULAR = {
    # Comment delimiters.
    '/*': 'TOKEN_CSTART',
    '/**': 'TOKEN_CSTART',
    '*/': 'TOKEN_CEND',
    # A bare '*' is always tagged TOKEN_MULT; token_analyzer() re-tags it
    # as TOKEN_CMID when it occurs inside a comment.
    '*': 'TOKEN_MULT',
    # Operators and punctuation.
    '=': 'TOKEN_ASSIGN',
    '==': 'TOKEN_EQUAL',
    '{': 'TOKEN_LBRACE',
    '}': 'TOKEN_RBRACE',
    '};': 'TOKEN_ENDBRACE',
    '+': 'TOKEN_PLUS',
    '-': 'TOKEN_MINUS',
    '/': 'TOKEN_DIV',
    '(': 'TOKEN_LPAREN',
    ')': 'TOKEN_RPAREN',
    ',': 'TOKEN_COMMA',
    ';': 'TOKEN_SEMICOLON',
    '<': 'TOKEN_LANGLE',
    '>': 'TOKEN_RANGLE',
    # Keywords, accepted in lower and upper case.
    'TYPEDEF': 'TOKEN_TYPEDEF',
    'typedef': 'TOKEN_TYPEDEF',
    'enum': 'TOKEN_ENUM',
    'ENUM': 'TOKEN_ENUM',
    'struct': 'TOKEN_STRUCT',
    'STRUCT': 'TOKEN_STRUCT',
    'char': 'TOKEN_CHAR',
    'CHAR': 'TOKEN_CHAR',
    'const': 'TOKEN_CONST',
    'CONST': 'TOKEN_CONST',
    'int': 'TOKEN_INT',
    'INT': 'TOKEN_INT',
    'long': 'TOKEN_LONG',
    'LONG': 'TOKEN_LONG',
    'short': 'TOKEN_SHORT',
    'SHORT': 'TOKEN_SHORT',
    'signed': 'TOKEN_SIGNED',
    'SIGNED': 'TOKEN_SIGNED',
    'unsigned': 'TOKEN_UNSIGNED',
    'UNSIGNED': 'TOKEN_UNSIGNED',
    'void': 'TOKEN_VOID',
    'VOID': 'TOKEN_VOID',
    'volatile': 'TOKEN_VOLATILE',
    'VOLATILE': 'TOKEN_VOLATILE',
    'double': 'TOKEN_DOUBLE',
    'DOUBLE': 'TOKEN_DOUBLE',
    'float': 'TOKEN_FLOAT',
    'FLOAT': 'TOKEN_FLOAT',
    '!defined': 'TOKEN_NOT_DEFINED',
    '!DEFINED': 'TOKEN_NOT_DEFINED',
    'boolean': 'TOKEN_BOOLEAN',
    'BOOLEAN': 'TOKEN_BOOLEAN',
    # Start of a function-pointer declarator.
    '(*': 'TOKEN_FUNCTION_POINTER',
}
|
|
|
|
# NASM spelling for every key of PREPROCESSOR_DIRECTIVES.
NASM_PREPROCESS_DIRECTIVES = {
    # '#' directives become '%' directives.
    '#include': '%include',
    '#define': '%define',
    '#undef': '%undef',
    '#if': '%if',
    '#ifdef': '%ifdef',
    '#ifndef': '%ifndef',
    '#endif': '%endif',
    '#error': '%error',
    # These are passed through unchanged.
    '__FILE__': '__FILE__',
    '__LINE__': '__LINE__',
    '__DATE__': '__DATE__',
    '__TIME__': '__TIME__',
    '__TIMESTAMP__': '__TIMESTAMP__',
    'pragma': 'pragma',
    '#': '#',
    '##': '##',
}

# Keyword emitted for enum constants.
NASM_ENUM = "EQU"

# C comment markers mapped to the text that replaces them in the output.
NASM_REGULAR = {
    '/*': ';',
    '*': ';',
    '*/': '',
}
|
|
|
|
#REGULAR += RESERVED.values()
|
|
|
|
# Tags of the parser stage.
PARSER_TOKENS = [
    'PARSE_MULTILINE_COMMENT',
    'PARSE_SINGLELINE_COMMENT',
    'PARSE_TYPEDEF_ENUM',
    'PARSE_TYPEDEF_STRUCT',
    'PARSE_TYPEDEF_STRUCT_STRUCT',
    'PARSE_STRUCT',
    'PARSE_TAG_NAME',
    'PARSE_STRUCT_MEMBER',
    'PARSE_ENDSTRUCT',
    'PARSE_ALIAS',
    'PARSE_FUNCTION_POINTER',
    'PARSE_FUNCTION',
    'PARSE_IFNDEF',
]
|
|
|
|
# Comment kinds.
COMMENT_SINGLE_LINE, COMMENT_MULTI_LINE = 0, 1

# Module-wide scanner state.  The PARSEOBJECT methods below reach these
# through ``global`` statements, so the values persist between calls.

# Read and written by tokenizer() / analyzer().
inside_comment = False
inside_string = False
inside_include = False
inside_struct = False

# Written by analyzer() while it classifies typedefs and struct bodies.
inside_typedef = False
inside_typedef_enum = False
inside_typedef_struct = False
inside_typedef_struct_struct = False
inside_braces = False
inside_member = False

# Written by parser_analyzer() while it handles '#if' lines.
inside_if = False
inside_ifndef = False
substitute = False
|
|
|
|
class PARSEOBJECT:
|
|
_passes = count(0)
|
|
_lineanalyzer = ANALYZER()
|
|
|
|
def __init__(self):
    """Create the tokenizer helper and start with empty parse buffers."""
    self.tokenize = TOKENIZER()
    # Two separate, initially empty buffers.
    self.parseline, self.parsefile = [], []
    # Pass number; updated by inc_passes().
    self.passes = 0
|
|
|
|
def parse_reset(self):
    """Return the instance to its pre-parse state.

    Empties both parse buffers, installs a fresh pass counter on the
    instance and sets every per-file state flag to ``False``.
    """
    self.parseline, self.parsefile = [], []
    # Stored on the instance, so it takes precedence over the class counter.
    self._passes = count(0)
    for flag_name in (
        'inside_comment',
        'inside_typedef',
        'typedef_enum',
        'typedef_struct',
        'struct_begin',
        'enum_begin',
        'struct',
        'struct_end',
    ):
        setattr(self, flag_name, False)
|
|
|
|
def inc_passes(self):
    """Advance the pass counter and store the value it yields in ``passes``."""
    pass_counter = self._passes
    self.passes = next(pass_counter)
|
|
|
|
def parseheader(self, fl, fn):
    """Run all parse passes over one header file and return the result.

    The lines of ``fl`` go through the line analyzer, then through
    ``token_analyzer`` and ``parser_analyzer``, and finally through
    ``parsetokens``.  After each of the two analyzer passes the
    intermediate lines are dumped next to ``fn`` as ``<stem>.tokenized``
    and ``<stem>.parsenized``.

    Args:
        fl: Iterable yielding the lines of the header file.
        fn: Path of the header; only used to derive the dump file names.

    Returns:
        The list produced by ``parsetokens``; it is also stored in
        ``self.parsefile``.
    """

    def render(lines):
        # One text line per parsed line; every word is followed by a space.
        return ''.join(
            ''.join(word + ' ' for word in line) + '\n' for line in lines
        )

    self.parse_reset()

    # Line-analyzer pass: a result of 'next' contributes nothing, any
    # other result becomes a one-element line.
    analyzed_lines = []
    for raw_line in fl:
        result = self._lineanalyzer.analyze(raw_line)
        if result != 'next':
            analyzed_lines.append([result])
    self.inc_passes()

    tokenized_lines = [self.token_analyzer(line) for line in analyzed_lines]
    dump_stem = os.path.splitext(fn)[0]
    self.write_file(dump_stem + '.tokenized', render(tokenized_lines))
    self.inc_passes()

    parsed_lines = [self.parser_analyzer(line) for line in tokenized_lines]
    # Rendered from scratch so this dump holds only the lines of this pass
    # and does not repeat the .tokenized content.
    self.write_file(dump_stem + '.parsenized', render(parsed_lines))
    self.inc_passes()

    self.parsefile = self.parsetokens(parsed_lines)
    return self.parsefile
|
|
|
|
def parseinclude(self, data):
    """Rewrite a C include target as a quoted ``.inc`` file name.

    ``<name.h>`` and a bare ``name.h`` both become ``"name.inc"``.  Only
    the leading ``<`` and the trailing ``.h>`` / ``.h`` are rewritten, so
    a ``.h`` or ``<`` occurring inside the name is left alone.  Any other
    text is returned unchanged, apart from a leading ``<`` being turned
    into ``"``.

    Args:
        data: The include target; it is converted with ``str()`` first.

    Returns:
        The rewritten target as a string.
    """
    target = str(data)
    if target.startswith('<'):
        target = '"' + target[1:]
        if target.endswith('.h>'):
            target = target[:-len('.h>')] + '.inc"'
    elif target.endswith('.h'):
        target = '"' + target[:-len('.h')] + '.inc"'
    return target
|
|
|
|
def tokenizer(self, w):
    """Classify a single whitespace-delimited word.

    Exact matches in ``PREPROCESSOR_DIRECTIVES`` and ``REGULAR`` are
    looked up first.  Otherwise comment and string delimiters update the
    module-level ``inside_comment`` / ``inside_string`` flags, an
    upper-case word is tagged as a macro and a lower-case word as a
    function or function pointer.

    Args:
        w: The word to classify.

    Returns:
        The token tag as a string, or ``False`` when the word gets no tag.
    """
    global inside_comment
    global inside_string
    global inside_include
    global inside_struct

    # Table lookups win over every heuristic below.
    if w in PREPROCESSOR_DIRECTIVES:
        return PREPROCESSOR_DIRECTIVES[w]
    if w in REGULAR:
        return REGULAR[w]

    if w.startswith('/*'):
        inside_comment = True
        return 'TOKEN_CSTART'
    if inside_comment and w.endswith('*/'):
        inside_comment = False
        return 'TOKEN_CEND'

    if w.startswith('"'):
        if w == '"':
            # A lone quote opens a string or closes the one already open.
            inside_string = not inside_string
        else:
            # A word such as "foo.h" opens and closes the string itself;
            # without this check the flag would stay set for good.
            inside_string = not w.endswith('"')
        return False
    if w.endswith('"'):
        inside_string = False
        return False

    if w.isupper():
        return False if inside_string else 'TOKEN_MACRO'

    if w.islower():
        if inside_string or inside_include or inside_struct:
            return False
        if w.startswith('(*'):
            return 'TOKEN_FUNCTION_POINTER'
        return 'TOKEN_FUNCTION'

    return False
|
|
|
|
def analyzer(self, ln):
    """Tag the words of one source line.

    The line is split on whitespace and every word is classified with
    ``self.tokenizer``.  The returned list holds the words in order; most
    tagged words are preceded by their tag.  Typedef, struct and brace
    state is kept in module-level flags, so it carries over to the next
    call; ``inside_include`` and ``inside_struct`` are cleared at the end
    of every line.

    Args:
        ln: One line of source text.

    Returns:
        A list of tags and words.
    """
    global inside_include, inside_typedef, inside_typedef_enum
    global inside_typedef_struct, inside_typedef_struct_struct
    global inside_braces, inside_struct, inside_member

    tagged = []
    for word in ln.split():
        tag = self.tokenizer(word)

        if tag == 'TOKEN_INCLUDE':
            inside_include = True
        if tag == 'TOKEN_TYPEDEF':
            inside_typedef = True

        if tag == 'TOKEN_ENUM' and inside_typedef:
            inside_typedef_enum = True
            inside_typedef = False
            # NOTE(review): replaces the first entry, which presumably is
            # the TOKEN_TYPEDEF tag of this line; raises IndexError when
            # nothing has been collected yet -- confirm.
            tagged[0] = 'TOKEN_TYPEDEF_ENUM'
            tagged.append(word)
            continue

        if tag == 'TOKEN_STRUCT':
            if not inside_typedef:
                inside_struct = True
                tagged += [tag, word]
                continue
            if ln.endswith(';\n'):
                # The declaration ends on this line.
                inside_typedef_struct = True
                inside_typedef = False
                tagged[0] = 'TOKEN_TYPEDEF_STRUCT'
                tagged.append(word)
                continue
            inside_typedef_struct_struct = True
            inside_typedef_struct = False
            inside_typedef = False
            tagged[0] = 'TOKEN_TYPEDEF_STRUCT_STRUCT'
            tagged.append(word)
            # THIS needs to be further refined!
            inside_typedef_struct_struct = False
            continue

        if tag == 'TOKEN_LBRACE':
            inside_braces = True
            tagged.append(word)
            continue

        if tag == 'TOKEN_RBRACE' and inside_struct:
            inside_braces = False
            inside_struct = False
            tagged += [tag, word]
            continue

        if inside_braces and inside_struct:
            # Words alternate: the first gets a TOKEN_MEMBER tag, the
            # following one is copied without a tag.
            if inside_member:
                inside_member = False
                tagged.append(word)
            else:
                inside_member = True
                tagged += ['TOKEN_MEMBER', word]
            continue

        if tag is False:
            tagged.append(word)
        else:
            tagged += [tag, word]

    inside_include = False
    inside_struct = False
    return tagged
|
|
|
|
def token_analyzer(self, ln):
    """Clean up the tags that fall inside a comment.

    Between ``TOKEN_CSTART`` and ``TOKEN_CEND`` a ``TOKEN_MULT`` entry is
    replaced by ``TOKEN_CMID`` and every other entry starting with
    ``TOKEN`` is dropped.  Entries outside a comment are copied as they
    are.  The comment state is the module-level ``inside_comment`` flag,
    so it carries over to the next line.

    Args:
        ln: List of tags and words for one line.

    Returns:
        The cleaned list.
    """
    global inside_comment

    cleaned = []
    for entry in ln:
        if entry == 'TOKEN_CSTART':
            inside_comment = True
            cleaned.append(entry)
        elif not inside_comment:
            cleaned.append(entry)
        elif entry == 'TOKEN_MULT':
            cleaned.append('TOKEN_CMID')
        elif entry == 'TOKEN_CEND':
            cleaned.append(entry)
            inside_comment = False
        elif not entry.startswith('TOKEN'):
            # Plain comment text is kept; stray tags are not.
            cleaned.append(entry)
    return cleaned
|
|
|
|
def parser_analyzer(self, ln):
    """Convert comment and ``#if`` tags of one line to parser tags.

    ``TOKEN_CSTART`` becomes ``PARSE_SINGLELINE_COMMENT`` when the line
    ends with ``TOKEN_CEND`` and ``PARSE_MULTILINE_COMMENT`` otherwise;
    ``TOKEN_CMID`` becomes ``PARSE_MULTILINE_COMMENT``; ``TOKEN_CEND`` is
    dropped; ``TOKEN_IF`` becomes ``PARSE_IF``.  State is kept in
    module-level flags and carries over between calls.

    Args:
        ln: List of tags and words for one line.

    Returns:
        The converted list.
    """
    global inside_comment, inside_if, inside_ifndef, substitute

    parsed = []
    # NOTE(review): entries collected here are never returned, and
    # ``substitute`` is never cleared in this method, so once a
    # TOKEN_NOT_DEFINED has been seen later entries end up here instead of
    # in the result -- confirm this is intended.
    held_back = []

    for entry in ln:
        if entry == 'TOKEN_CSTART':
            inside_comment = True
            if ln[-1] == 'TOKEN_CEND':
                parsed.append('PARSE_SINGLELINE_COMMENT')
            else:
                parsed.append('PARSE_MULTILINE_COMMENT')
            continue
        if entry == 'TOKEN_CMID':
            parsed.append('PARSE_MULTILINE_COMMENT')
            continue
        if entry == 'TOKEN_CEND':
            inside_comment = False
            continue
        if not inside_comment and entry == '*/':
            # A raw comment terminator outside a comment is dropped.
            continue
        if entry == 'TOKEN_IF':
            inside_if = True
            parsed.append('PARSE_IF')
            continue
        if inside_if:
            if entry == 'TOKEN_NOT_DEFINED':
                substitute = True
                inside_if = False
                inside_ifndef = True
                held_back.append('PARSE_IFNDEF')
            else:
                parsed.append(entry)
            continue
        if substitute:
            held_back.append(entry)
        else:
            parsed.append(entry)
    return parsed
|
|
|
|
def parsetokens(self, fl):
    """Turn parser-analyzed lines into output lines.

    Per input line:

    * an empty list yields ``[""]``;
    * a line holding a comment tag is passed to ``parse_comment``;
    * a line holding ``typedef`` / ``TYPEDEF`` is classified with
      ``parse_typedef`` and yields either a copy prefixed with ``"; "``
      or a ``struc`` / ``endstruc`` pair;
    * a line holding ``TOKEN_PREPROCESS`` is passed to
      ``parse_preprocess``;
    * while a typedef enum is open, enumerator lines yield
      ``name EQU value`` entries.

    Args:
        fl: List of lines, each a list of tags and words.

    Returns:
        List of output lines, each a list of strings.
    """
    output = []
    enum_value = 0
    comment_tags = ("TOKEN_CSTART", "TOKEN_CMID", "TOKEN_CEND")

    for line in fl:
        if line == []:
            output.append([""])
            continue

        if any(tag in line for tag in comment_tags):
            self.inside_comment = True
            output.append(self.parse_comment(line))
            continue

        if "TYPEDEF" in line or "typedef" in line:
            self.parse_typedef(line)
            if not (self.typedef_enum or self.typedef_struct):
                # Neither enum nor struct: keep the line behind a "; ".
                output.append(["; "] + [entry for entry in line])
            if self.typedef_struct:
                # The last word minus its final character is used as name.
                output.append(['struc', line[-1][:-1]])
                output.append(['endstruc'])
            continue

        if "struct" in line:
            # Called for its effect on self.struct; the result is unused.
            self.parse_struct(line)

        if "TOKEN_PREPROCESS" in line:
            output.append(self.parse_preprocess(line))
            continue

        if not (self.inside_typedef and self.typedef_enum):
            continue

        head = line[0]
        width = len(line)

        if head == "TOKEN_LBRACE" and width == 2:
            self.enum_begin = True
            enum_value = 0
            continue

        if width == 1:
            if head.endswith(","):
                output.append([head[:-1] + "\t", "EQU\t", str(enum_value)])
                enum_value += 1
            else:
                output.append([head + "\t", "EQU\t", str(enum_value)])
            continue

        if width == 3 and head.endswith(","):
            # NOTE(review): the counter now holds the third entry of the
            # line as-is; if that is a string, the ``+= 1`` above raises
            # TypeError on the next enumerator -- confirm the expected
            # line layout.
            enum_value = line[2]
            output.append([head[:-1] + "\t", "EQU" + "\t", enum_value])
            continue

        if head == "TOKEN_RBRACE" and width == 3:
            self.enum_begin = False
            self.typedef_enum = False
            self.inside_typedef = False
            enum_value = 0

    return output
|
|
|
|
def parse_struct(self, l):
    """Translate the words of a ``struct`` line.

    ``struct`` is replaced by ``struc`` and sets ``self.struct``.  An
    opening brace seen while ``self.struct`` is set marks the start of the
    body through ``self.struct_begin`` and is not copied.  Empty strings
    are skipped; every other word is copied unchanged.

    Args:
        l: List of words of one line.

    Returns:
        The translated list of words.
    """
    translated = []
    for word in l:
        if word == "struct":
            self.struct = True
            translated.append('struc')
        elif word == "{" and self.struct:
            # Tested before the generic copy below; after it this branch
            # could never run and struct_begin would never be set.
            self.struct_begin = True
        elif word != "":
            translated.append(word)
    return translated
|
|
|
|
def parse_typedef(self, l):
    """Record what kind of typedef the line declares.

    Sets ``self.inside_typedef`` when the typedef keyword is found and
    then ``self.typedef_enum`` or ``self.typedef_struct`` according to the
    ``enum`` / ``struct`` keyword on the line.  Keywords are recognised in
    lower and upper case.

    Args:
        l: List of words of one line.

    Returns:
        None; the result is stored in the instance flags.
    """
    for word in l:
        if word in ("TYPEDEF", "typedef"):
            self.inside_typedef = True
            # A new typedef starts unclassified.  Without this, the kind
            # left over from an earlier typedef would also be applied to
            # a plain one such as ``typedef int x;``.
            self.typedef_enum = False
            self.typedef_struct = False
        elif word in ("ENUM", "enum"):
            self.typedef_enum = True
            self.typedef_struct = False
        elif word in ("STRUCT", "struct"):
            self.typedef_struct = True
            self.typedef_enum = False
|
|
|
|
def parse_comment(self, l):
    """Convert the entries of a comment line for the output.

    Entries listed in ``TOKENS`` are dropped, entries found in
    ``NASM_REGULAR`` are replaced by their mapped text and everything else
    is copied unchanged.

    Args:
        l: List of tags and words of one line.

    Returns:
        The converted list.
    """
    return [
        NASM_REGULAR.get(entry, entry)
        for entry in l
        if entry not in TOKENS
    ]
|
|
|
|
def parse_preprocess(self, l):
    """Convert the entries of a preprocessor line for the output.

    Entries listed in ``TOKENS`` are dropped.  A key of
    ``PREPROCESSOR_DIRECTIVES`` is replaced through
    ``NASM_PREPROCESS_DIRECTIVES``, an entry starting with ``<`` goes
    through ``parseinclude``, a key of ``NASM_REGULAR`` is replaced by its
    mapped text and everything else is copied unchanged.

    Args:
        l: List of tags and words of one line.

    Returns:
        The converted list.
    """
    converted = []
    for entry in l:
        if entry in TOKENS:
            continue
        if entry in PREPROCESSOR_DIRECTIVES:
            replacement = NASM_PREPROCESS_DIRECTIVES.get(entry)
        elif entry.startswith("<"):
            replacement = self.parseinclude(entry)
        elif entry in NASM_REGULAR:
            replacement = NASM_REGULAR.get(entry)
        else:
            replacement = entry
        converted.append(replacement)
    return converted
|
|
|
|
def write_file(self, fn, data):
    """Write ``data`` to the text file ``fn``, replacing any old content.

    Missing parent directories are created first.

    Args:
        fn: Path of the file to write.
        data: Text to write.

    Raises:
        OSError: If a directory cannot be created or the file cannot be
            opened or written.
    """
    directory = os.path.dirname(fn)
    # dirname() is '' for a bare file name; nothing needs creating then.
    if directory:
        # exist_ok also covers another process creating the directory
        # between a check and the call.
        os.makedirs(directory, exist_ok=True)
    # The with-block closes the file even when write() raises.
    with open(fn, "w") as newfile:
        newfile.write(data)
|
|
|
|
class PARSER(PARSEOBJECT):
    """PARSEOBJECT subclass whose instances are numbered on creation."""

    # Counters stored on the class, so all PARSER instances draw from them.
    _ids = count(0)
    _passes = count(0)

    def __init__(self):
        """Take the next id and pass number and create empty tuple buffers."""
        # NOTE(review): PARSEOBJECT.__init__ is not called here, so the
        # attributes it sets (e.g. ``tokenize``) are absent on a PARSER
        # until something else assigns them -- confirm this is intended.
        self.id = next(self._ids)
        self.tupline, self.tupfile = [], []
        self.passes = next(self._passes)