Files
h2inc/parser.py
2019-02-21 22:58:13 +01:00

535 lines
20 KiB
Python

'''
Contains class PARSER
'''
from itertools import count
import os
from tokenizer import TOKENIZER
from lineanalyzer import ANALYZER
# Element type definitions. Used in the parse process.
ELEMENT_TYPE_PREPROCESS = 1
ELEMENT_TYPE_REGULAR = 2

# Internal token names.  parse_comment() and parse_preprocess() drop every
# word that appears in this list.  Each name is listed exactly once.
TOKENS = [
    'TOKEN_CSTART', 'TOKEN_CMID', 'TOKEN_CEND', 'TOKEN_RPAREN',
    'TOKEN_LPAREN', 'TOKEN_ENDLINE', 'TOKEN_RETVAL', 'TOKEN_TYPEDEF',
    'TOKEN_IF', 'TOKEN_PLUS', 'TOKEN_MINUS', 'TOKEN_DIV', 'TOKEN_MULT',
    'TOKEN_ASSIGN', 'TOKEN_EQUAL', 'TOKEN_LBRACE', 'TOKEN_RBRACE',
    'TOKEN_COMMA', 'TOKEN_SEMICOLON', 'TOKEN_LANGLE', 'TOKEN_RANGLE',
    'TOKEN_POINTER', 'TOKEN_STRUCT', 'TOKEN_ENUM', 'TOKEN_MACRO',
    'TOKEN_FUNCTION', 'TOKEN_TYPEDEF_ENUM', 'TOKEN_TYPEDEF_STRUCT',
    'TOKEN_TYPEDEF_STRUCT_STRUCT', 'TOKEN_TAG_NAME', 'TOKEN_ALIAS',
]

# C keywords mapped to their upper-case names.
RESERVED = {
    'auto': 'AUTO', 'break': 'BREAK', 'case': 'CASE', 'char': 'CHAR',
    'const': 'CONST', 'continue': 'CONTINUE', 'default': 'DEFAULT',
    'do': 'DO', 'int': 'INT', 'long': 'LONG', 'register': 'REGISTER',
    'return': 'RETURN', 'short': 'SHORT', 'signed': 'SIGNED',
    'sizeof': 'SIZEOF', 'static': 'STATIC', 'struct': 'STRUCT',
    'switch': 'SWITCH', 'typedef': 'TYPEDEF', 'union': 'UNION',
    'unsigned': 'UNSIGNED', 'void': 'VOID', 'volatile': 'VOLATILE',
    'while': 'WHILE', 'double': 'DOUBLE', 'else': 'ELSE', 'enum': 'ENUM',
    'extern': 'EXTERN', 'float': 'FLOAT', 'for': 'FOR', 'goto': 'GOTO',
    'if': 'IF',
}

# C preprocessor words mapped to internal token names.
PREPROCESSOR_DIRECTIVES = {
    '#include': 'TOKEN_INCLUDE', '#define': 'TOKEN_DEFINE',
    '#undef': 'TOKEN_UNDEFINE', '#if': 'TOKEN_IF', '#ifdef': 'TOKEN_IFDEF',
    '#ifndef': 'TOKEN_IFNDEF', '#error': 'TOKEN_ERROR',
    '__FILE__': 'TOKEN_BASE_FILE', '__LINE__': 'TOKEN_BASE_LINE',
    '__DATE__': 'TOKEN_BASE_DATE', '__TIME__': 'TOKEN_BASE_TIME',
    '__TIMESTAMP__': 'TOKEN_BASE_TIMESTAMP', 'pragma': 'TOKEN_PRAGMA',
    '#': 'TOKEN_HASH', '##': 'TOKEN_DOUBLEHASH', '#endif': 'TOKEN_ENDIF',
}

# Regular C words and punctuation mapped to internal token names.
# '*' always maps to TOKEN_MULT here; token_analyzer() rewrites it to
# TOKEN_CMID while inside a comment, so no separate '*' entry is needed
# (a dict literal keeps only the last value given for a repeated key).
REGULAR = {
    '/*': 'TOKEN_CSTART', '/**': 'TOKEN_CSTART', '*/': 'TOKEN_CEND',
    '*': 'TOKEN_MULT',
    '=': 'TOKEN_ASSIGN', '==': 'TOKEN_EQUAL',
    '{': 'TOKEN_LBRACE', '}': 'TOKEN_RBRACE', '};': 'TOKEN_ENDBRACE',
    '+': 'TOKEN_PLUS', '-': 'TOKEN_MINUS', '/': 'TOKEN_DIV',
    '(': 'TOKEN_LPAREN', ')': 'TOKEN_RPAREN', ',': 'TOKEN_COMMA',
    ';': 'TOKEN_SEMICOLON', '<': 'TOKEN_LANGLE', '>': 'TOKEN_RANGLE',
    'TYPEDEF': 'TOKEN_TYPEDEF', 'typedef': 'TOKEN_TYPEDEF',
    'enum': 'TOKEN_ENUM', 'ENUM': 'TOKEN_ENUM',
    'struct': 'TOKEN_STRUCT', 'STRUCT': 'TOKEN_STRUCT',
    'char': 'TOKEN_CHAR', 'CHAR': 'TOKEN_CHAR',
    'const': 'TOKEN_CONST', 'CONST': 'TOKEN_CONST',
    'int': 'TOKEN_INT', 'INT': 'TOKEN_INT',
    'long': 'TOKEN_LONG', 'LONG': 'TOKEN_LONG',
    'short': 'TOKEN_SHORT', 'SHORT': 'TOKEN_SHORT',
    'signed': 'TOKEN_SIGNED', 'SIGNED': 'TOKEN_SIGNED',
    'unsigned': 'TOKEN_UNSIGNED', 'UNSIGNED': 'TOKEN_UNSIGNED',
    'void': 'TOKEN_VOID', 'VOID': 'TOKEN_VOID',
    'volatile': 'TOKEN_VOLATILE', 'VOLATILE': 'TOKEN_VOLATILE',
    'double': 'TOKEN_DOUBLE', 'DOUBLE': 'TOKEN_DOUBLE',
    'float': 'TOKEN_FLOAT', 'FLOAT': 'TOKEN_FLOAT',
    '!defined': 'TOKEN_NOT_DEFINED', '!DEFINED': 'TOKEN_NOT_DEFINED',
    'boolean': 'TOKEN_BOOLEAN', 'BOOLEAN': 'TOKEN_BOOLEAN',
    '(*': 'TOKEN_FUNCTION_POINTER',
}

# C preprocessor words mapped to the text emitted for NASM.
NASM_PREPROCESS_DIRECTIVES = {
    '#include': '%include', '#define': '%define', '#undef': '%undef',
    '#if': '%if', '#ifdef': '%ifdef', '#ifndef': '%ifndef',
    '#endif': '%endif', '#error': '%error',
    '__FILE__': '__FILE__', '__LINE__': '__LINE__',
    '__DATE__': '__DATE__', '__TIME__': '__TIME__',
    '__TIMESTAMP__': '__TIMESTAMP__',
    'pragma': 'pragma', '#': '#', '##': '##',
}
NASM_ENUM = "EQU"
# C comment markers mapped to the text emitted for NASM.
NASM_REGULAR = {'/*': ';', '*': ';', '*/': ''}

PARSER_TOKENS = [
    'PARSE_MULTILINE_COMMENT', 'PARSE_SINGLELINE_COMMENT',
    'PARSE_TYPEDEF_ENUM', 'PARSE_TYPEDEF_STRUCT',
    'PARSE_TYPEDEF_STRUCT_STRUCT', 'PARSE_STRUCT', 'PARSE_TAG_NAME',
    'PARSE_STRUCT_MEMBER', 'PARSE_ENDSTRUCT', 'PARSE_ALIAS',
    'PARSE_FUNCTION_POINTER', 'PARSE_FUNCTION', 'PARSE_IFNDEF',
]

COMMENT_SINGLE_LINE = 0
COMMENT_MULTI_LINE = 1

# Module-level parse state.  tokenizer(), analyzer(), token_analyzer() and
# parser_analyzer() read and write these through ``global`` declarations.
inside_member = False
inside_braces = False
inside_typedef_struct_struct = False
inside_typedef_struct = False
inside_typedef_enum = False
inside_typedef = False
inside_struct = False
inside_include = False
inside_string = False
inside_comment = False
inside_if = False
inside_ifndef = False
substitute = False
# Base parser: owns the per-file parse state and the passes that turn
# analysed header lines into NASM-style output lines.
class PARSEOBJECT:
# Pass counter (itertools.count starting at 0).  Defined on the class, so it
# is shared by all instances until parse_reset() rebinds it on the instance.
_passes = count(0)
# One ANALYZER instance created when the class body runs and shared by all
# instances; parseheader() passes every input line to its analyze() method.
_lineanalyzer = ANALYZER()
def __init__(self):
    """Create the tokenizer and initialise every parse-state attribute.

    The state flags are set here with the same defaults ``parse_reset``
    applies, so ``parsetokens``, ``parse_typedef`` and ``parse_struct`` can
    be called on a fresh instance without raising ``AttributeError``.
    """
    self.tokenize = TOKENIZER()
    self.parseline = []
    self.parsefile = []
    self.passes = 0
    # Parse-state flags read by parsetokens() and the parse_* helpers.
    self.inside_comment = False
    self.inside_typedef = False
    self.typedef_enum = False
    self.typedef_struct = False
    self.struct_begin = False
    self.enum_begin = False
    self.struct = False
    self.struct_end = False
def parse_reset(self):
    """Return the parser to its initial state before a new file is parsed.

    Clears the collected lines, restarts the pass counter and resets both
    the per-instance flags and the module-level ``inside_*`` / ``substitute``
    flags.  The module-level flags are the ones ``tokenizer``, ``analyzer``,
    ``token_analyzer`` and ``parser_analyzer`` read, so without this reset
    state left over from one file (an unterminated comment, or
    ``substitute``, which nothing else clears) carries into the next file.
    """
    global inside_member, inside_braces, inside_typedef_struct_struct
    global inside_typedef_struct, inside_typedef_enum, inside_typedef
    global inside_struct, inside_include, inside_string, inside_comment
    global inside_if, inside_ifndef, substitute

    self.parseline = []
    self.parsefile = []
    # Rebinding on the instance gives this parser its own fresh counter.
    self._passes = count(0)

    self.inside_comment = False
    self.inside_typedef = False
    self.typedef_enum = False
    self.typedef_struct = False
    self.struct_begin = False
    self.enum_begin = False
    self.struct = False
    self.struct_end = False

    inside_member = False
    inside_braces = False
    inside_typedef_struct_struct = False
    inside_typedef_struct = False
    inside_typedef_enum = False
    inside_typedef = False
    inside_struct = False
    inside_include = False
    inside_string = False
    inside_comment = False
    inside_if = False
    inside_ifndef = False
    substitute = False
def inc_passes(self):
    """Advance the pass counter and store its new value in ``self.passes``."""
    pass_counter = self._passes
    self.passes = next(pass_counter)
def parseheader(self, fl, fn):
    """Run every parse pass over one header file and return the result.

    Passes, in order:
      1. each line of ``fl`` is given to the line analyzer; lines for which
         it answers ``'next'`` are dropped;
      2. ``token_analyzer`` runs on every kept line and the result is
         dumped to ``<fn without extension>.tokenized``;
      3. ``parser_analyzer`` runs on that result and is dumped to
         ``<fn without extension>.parsenized``;
      4. ``parsetokens`` converts the parsed lines.

    Each dump file holds only the output of its own pass.

    Args:
        fl: iterable of input lines.
        fn: path of the header; used only to derive the dump file names.

    Returns:
        The list produced by ``parsetokens`` (also kept in
        ``self.parsefile``).
    """
    def render(lines):
        # Dump format: every word followed by a space, one line per entry.
        return "".join(
            "".join(word + " " for word in line) + "\n" for line in lines
        )

    self.parse_reset()
    basename = os.path.splitext(fn)[0]

    # Pass 1: each kept result is wrapped in its own one-element list.
    analysed_lines = []
    for raw_line in fl:
        result = self._lineanalyzer.analyze(raw_line)
        if result != 'next':
            analysed_lines.append([result])
    self.inc_passes()

    # Pass 2: token analysis.
    tokenized = [self.token_analyzer(line) for line in analysed_lines]
    self.write_file(basename + '.tokenized', render(tokenized))
    self.inc_passes()

    # Pass 3: parser analysis.
    parsed = [self.parser_analyzer(line) for line in tokenized]
    self.write_file(basename + '.parsenized', render(parsed))
    self.inc_passes()

    # Pass 4: conversion to output lines.
    self.parsefile = self.parsetokens(parsed)
    return self.parsefile
def parseinclude(self, data):
    """Translate a C include target into its NASM ``.inc`` counterpart.

    ``<name.h>`` and a bare ``name.h`` both become ``"name.inc"``.  Only the
    leading ``<`` and the final ``.h`` / ``.h>`` are rewritten, so a file
    name that contains ``.h`` elsewhere (``my.hdr.h``) is left intact.
    Any other input is returned unchanged apart from a leading ``<`` being
    turned into ``"``.

    Args:
        data: the include target; converted with ``str()`` first.

    Returns:
        The translated target as a string.
    """
    target = str(data)
    if target.startswith('<'):
        target = '"' + target[1:]
        # Rewrite only the last '.h>' so dots inside the name survive.
        head, found, tail = target.rpartition('.h>')
        if found:
            target = head + '.inc"' + tail
    if target.endswith('.h'):
        target = '"' + target[:-len('.h')] + '.inc"'
    return target
def tokenizer(self, w):
    """Classify a single whitespace-separated word.

    Lookup order: preprocessor table, regular table, comment start/end,
    string start/end, then all-upper-case words (macros) and
    all-lower-case words (functions / function pointers).

    Updates the module-level ``inside_comment`` and ``inside_string``
    flags; ``inside_include`` and ``inside_struct`` are only read.

    Args:
        w: the word to classify.

    Returns:
        The token name as a string, or ``False`` when the word has no token.
    """
    # NOTE(review): '/*' and '*/' as stand-alone words are matched by the
    # REGULAR table below and therefore never touch inside_comment.
    global inside_comment
    global inside_string

    if w in PREPROCESSOR_DIRECTIVES:
        return PREPROCESSOR_DIRECTIVES[w]
    if w in REGULAR:
        return REGULAR[w]
    if w.startswith('/*'):
        inside_comment = True
        return 'TOKEN_CSTART'
    if inside_comment and w.endswith('*/'):
        inside_comment = False
        return 'TOKEN_CEND'
    if w.startswith('"'):
        # A word like "foo.h" opens and closes the string by itself; only
        # an unterminated word leaves the string open for following words.
        inside_string = not (len(w) > 1 and w.endswith('"'))
        return False
    if w.endswith('"'):
        inside_string = False
        return False
    if w.isupper():
        return False if inside_string else 'TOKEN_MACRO'
    if w.islower():
        if inside_string or inside_include or inside_struct:
            return False
        if w.startswith('(*'):
            return 'TOKEN_FUNCTION_POINTER'
        return 'TOKEN_FUNCTION'
    return False
# Classify every whitespace-separated word of one source line.
# Returns a flat list that mixes token names with the original words.
# Reads and updates the module-level inside_* flags declared global below.
# NOTE(review): indentation is missing in this copy of the file, so any
# nesting implied by these comments is inferred from statement order only —
# confirm against a properly indented copy before changing the logic.
def analyzer(self, ln):
global inside_include
global inside_typedef
global inside_typedef_enum
global inside_typedef_struct
global inside_typedef_struct_struct
global inside_braces
global inside_struct
global inside_member
analysed = []
# str.split() without arguments splits on any run of whitespace.
word = [w for w in ln.split()]
for w in word:
# t is the token name returned by tokenizer(), or False when there is none.
t = self.tokenizer(w)
if t == 'TOKEN_INCLUDE':
inside_include = True
if t == 'TOKEN_TYPEDEF':
inside_typedef = True
# 'enum' while inside_typedef is set: the first collected entry is swapped
# for TOKEN_TYPEDEF_ENUM.  pop(0) raises IndexError if the list is empty.
if t == 'TOKEN_ENUM' and inside_typedef == True:
inside_typedef_enum = True
inside_typedef = False
analysed.pop(0)
analysed.insert(0,'TOKEN_TYPEDEF_ENUM')
analysed.append(w)
continue
if t == 'TOKEN_STRUCT':
if inside_typedef == True:
# A line whose text ends with ';' plus newline is tagged TOKEN_TYPEDEF_STRUCT;
# any other typedef-struct line is tagged TOKEN_TYPEDEF_STRUCT_STRUCT.
if ln.endswith(';\n'):
inside_typedef_struct = True
inside_typedef = False
analysed.pop(0)
analysed.insert(0,'TOKEN_TYPEDEF_STRUCT')
analysed.append(w)
continue
else:
inside_typedef_struct_struct = True
inside_typedef_struct = False
inside_typedef = False
analysed.pop(0)
analysed.insert(0,'TOKEN_TYPEDEF_STRUCT_STRUCT')
analysed.append(w)
inside_typedef_struct_struct = False #### THIS needs to be further refined!
continue
else:
# 'struct' outside a typedef: emit the token followed by the word itself.
inside_struct = True
analysed.append(t)
analysed.append(w)
continue
# An opening brace is appended as the bare word, without its token name.
if t == 'TOKEN_LBRACE':
inside_braces = True
analysed.append(w)
continue
if t == 'TOKEN_RBRACE' and inside_struct == True:
inside_braces = False
inside_struct = False
analysed.append(t)
analysed.append(w)
continue
# Inside a struct body inside_member toggles on every word: a word seen
# while it is False gets a TOKEN_MEMBER marker in front of it, the next
# word is appended bare.
if inside_braces == True and inside_struct == True:
if inside_member == True:
inside_member = False
analysed.append(w)
continue
else:
t = 'TOKEN_MEMBER'
inside_member = True
analysed.append(t)
analysed.append(w)
continue
# Words without a token are appended bare; all others as (token, word).
if t == False:
analysed.append(w)
continue
else:
analysed.append(t)
analysed.append(w)
# NOTE(review): presumably these two resets run once per line, after the
# word loop — confirm the intended indentation.
inside_include = False
inside_struct = False
return analysed
# Post-process the words of one tokenized line and return the resulting list.
# Comment state is tracked in the module-level inside_comment flag, so it
# carries over from one call (line) to the next.
# NOTE(review): indentation is missing in this copy of the file; it cannot be
# told from here whether the final analyzed.append(w) runs for every word
# that reaches it or only inside the comment branch — confirm.
def token_analyzer(self, ln):
global inside_comment
analyzed = []
for w in ln:
# A comment start is kept and switches comment mode on.
if w == 'TOKEN_CSTART':
inside_comment = True
analyzed.append(w)
continue
if inside_comment == True:
# Inside a comment a '*' token is a comment continuation, not a multiply.
if w == 'TOKEN_MULT':
analyzed.append('TOKEN_CMID')
continue
else:
# A comment end is kept and switches comment mode off.
if w == 'TOKEN_CEND':
analyzed.append(w)
inside_comment = False
continue
else:
# Any other word starting with 'TOKEN' is dropped while inside a comment.
if w.startswith('TOKEN'):
continue
analyzed.append(w)
return analyzed
# Turn the words of one token-analysed line into parser-level words and
# return them as a list.
# State is kept in the module-level inside_comment, inside_if, inside_ifndef
# and substitute flags, so it carries over between calls.
# NOTE(review): indentation is missing in this copy of the file; in
# particular it is unclear whether the trailing substitute/else branch sits
# inside the 'inside_comment == False' block or at loop level — confirm.
# NOTE(review): subst is filled but never returned or read in this function,
# and nothing in this function sets substitute back to False.
def parser_analyzer(self, ln):
global inside_comment
global inside_if
global inside_ifndef
global substitute
analyzed = []
subst = []
for w in ln:
# A comment start is single-line when the line's last word is TOKEN_CEND,
# otherwise it opens a multi-line comment.
if w == 'TOKEN_CSTART':
inside_comment = True
if ln[-1] != 'TOKEN_CEND':
analyzed.append('PARSE_MULTILINE_COMMENT')
continue
else:
analyzed.append('PARSE_SINGLELINE_COMMENT')
continue
if w == 'TOKEN_CMID':
analyzed.append('PARSE_MULTILINE_COMMENT')
continue
# The comment end token is dropped and switches comment mode off.
if w == 'TOKEN_CEND':
inside_comment = False
continue
if inside_comment == False:
# A literal '*/' outside a comment is dropped.
if w == '*/':
continue
if w == 'TOKEN_IF':
inside_if = True
analyzed.append('PARSE_IF')
continue
if inside_if == True:
# TOKEN_NOT_DEFINED after an if switches to substitution mode: PARSE_IFNDEF
# goes into subst (not into the returned list).
if w == 'TOKEN_NOT_DEFINED':
substitute = True
inside_if = False
inside_ifndef = True
subst.append('PARSE_IFNDEF')
continue
else:
analyzed.append(w)
continue
# While substitute is set, words are collected in subst instead of analyzed.
if substitute == True:
subst.append(w)
continue
else:
analyzed.append(w)
continue
return analyzed
# Convert the analysed lines in fl into output lines and return them as a
# list of lists of strings.
# Comment lines go through parse_comment(); typedef lines produce either a
# ';'-prefixed copy of the line or a 'struc' / 'endstruc' pair; members of a
# typedef enum are emitted as name, 'EQU' and a value.
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting described below is inferred from statement order only — confirm
# against a properly indented copy before changing the logic.
def parsetokens(self, fl):
templine = []
tempfile = []
# Running value assigned to enum members that carry no explicit value.
enum_cnt = 0
for l in fl:
templine = []
tempstr = ""
# An empty input line becomes an output line holding one empty string.
if l == []:
templine.append("")
tempfile.append(templine)
continue
if "TOKEN_CSTART" in l:
self.inside_comment = True
tempfile.append(self.parse_comment(l))
continue
if "TOKEN_CMID" in l:
self.inside_comment = True
tempfile.append(self.parse_comment(l))
continue
# NOTE(review): the comment end also sets inside_comment to True, exactly
# like the two branches above — possibly meant to be False; confirm.
if "TOKEN_CEND" in l:
self.inside_comment = True
tempfile.append(self.parse_comment(l))
continue
# Upper-case typedef; the lower-case branch further down repeats the same
# statements.
if "TYPEDEF" in l:
self.parse_typedef(l)
# Neither enum nor struct: emit the whole line behind a '; ' marker.
if self.typedef_enum == False and self.typedef_struct == False:
templine.append("; ")
for e in l:
templine.append(e)
tempfile.append(templine)
# typedef struct: emit 'struc' plus the last word of the line without its
# final character (presumably a trailing ';' — confirm), then 'endstruc'.
if self.typedef_struct == True:
templine.append('struc')
templine.append(l[-1][:-1])
tempfile.append(templine)
templine = []
templine.append('endstruc')
tempfile.append(templine)
continue
if "typedef" in l:
self.parse_typedef(l)
if self.typedef_enum == False and self.typedef_struct == False:
templine.append("; ")
for e in l:
templine.append(e)
tempfile.append(templine)
if self.typedef_struct == True:
templine.append('struc')
templine.append(l[-1][:-1])
tempfile.append(templine)
templine = []
templine.append('endstruc')
tempfile.append(templine)
continue
# The list returned by parse_struct() is discarded here; only the flags it
# sets on self are kept.
if "struct" in l:
self.parse_struct(l)
# NOTE(review): none of the token tables visible in this file contains a
# 'TOKEN_PREPROCESS' value — verify what produces this marker.
if "TOKEN_PREPROCESS" in l:
tempfile.append(self.parse_preprocess(l))
continue
if self.inside_typedef == True:
if self.typedef_enum == True:
# Opening brace of the enum body: restart the member counter.
if l[0] == "TOKEN_LBRACE" and len(l) == 2:
self.enum_begin = True
enum_cnt = 0
continue
# One-word line: an enum member without an explicit value.
if len(l) == 1:
if l[0].endswith(","):
tempstr = l[0]
templine.append(tempstr[:-1]+"\t")
templine.append("EQU\t")
templine.append(str(enum_cnt))
tempfile.append(templine)
enum_cnt += 1
continue
else:
# No trailing comma: the value is emitted but the counter is not advanced.
templine.append(l[0]+"\t")
templine.append("EQU\t")
templine.append(str(enum_cnt))
tempfile.append(templine)
continue
# Three-word line: an enum member with an explicit value taken from l[2].
# NOTE(review): enum_cnt is rebound to l[2] as-is; if that is a string, a
# later 'enum_cnt += 1' would raise TypeError — confirm the element type.
if len(l) == 3:
if l[0].endswith(","):
tempstr = l[0]
enum_cnt = l[2]
templine.append(tempstr[:-1]+"\t")
templine.append("EQU"+"\t")
templine.append(enum_cnt)
tempfile.append(templine)
continue
# Closing brace of the enum body: leave typedef/enum mode.
if l[0] == "TOKEN_RBRACE" and len(l) == 3:
self.enum_begin = False
self.typedef_enum = False
self.inside_typedef = False
enum_cnt = 0
continue
return tempfile
def parse_struct(self, l):
    """Translate the words of a ``struct`` line and update the struct flags.

    ``struct`` is replaced by ``struc`` and sets ``self.struct``.  An
    opening brace seen while ``self.struct`` is set marks the start of the
    body (``self.struct_begin``) and is not emitted.  Empty strings are
    dropped; every other word is copied through.

    Args:
        l: list of words making up the line.

    Returns:
        The translated list of words.
    """
    templine = []
    for w in l:
        if w == "struct":
            self.struct = True
            templine.append('struc')
        elif w == "{" and self.struct:
            # Tested before the generic branch, which would otherwise
            # consume the brace and leave struct_begin unset.
            self.struct_begin = True
        elif w != "":
            templine.append(w)
    return templine
def parse_typedef(self, l):
    """Scan a line for typedef / enum / struct keywords and set the flags.

    Each keyword is matched in all-upper-case or all-lower-case spelling
    only.  ``enum`` and ``struct`` are mutually exclusive: the last one
    seen wins.  Flags are touched only when a keyword is found.  Returns
    nothing.
    """
    for word in l:
        if word in ("TYPEDEF", "typedef"):
            self.inside_typedef = True
        elif word in ("ENUM", "enum"):
            self.typedef_enum, self.typedef_struct = True, False
        elif word in ("STRUCT", "struct"):
            self.typedef_struct, self.typedef_enum = True, False
def parse_comment(self, l):
    """Convert the words of one comment line for NASM output.

    Words listed in ``TOKENS`` are dropped, words that are keys of
    ``NASM_REGULAR`` are replaced by the mapped text, and every other word
    is kept unchanged.  Returns the resulting list.
    """
    return [
        NASM_REGULAR.get(word, word)
        for word in l
        if word not in TOKENS
    ]
def parse_preprocess(self, l):
    """Convert the words of one preprocessor line for NASM output.

    Words listed in ``TOKENS`` are dropped.  Of the remaining words,
    preprocessor directives are replaced through
    ``NASM_PREPROCESS_DIRECTIVES``, words starting with ``<`` go through
    ``parseinclude``, keys of ``NASM_REGULAR`` are replaced by the mapped
    text, and anything else is kept.  Returns the resulting list.
    """
    translated = []
    for word in l:
        if word in TOKENS:
            continue
        if word in PREPROCESSOR_DIRECTIVES:
            replacement = NASM_PREPROCESS_DIRECTIVES.get(word)
        elif word.startswith("<"):
            replacement = self.parseinclude(word)
        elif word in NASM_REGULAR:
            replacement = NASM_REGULAR.get(word)
        else:
            replacement = word
        translated.append(replacement)
    return translated
def write_file(self, fn, data):
    """Write ``data`` to the file ``fn``, creating missing directories.

    Args:
        fn: path of the file to write; an existing file is overwritten.
        data: text to write.

    Raises:
        OSError: if a directory cannot be created or the file cannot be
            opened or written.
    """
    directory = os.path.dirname(fn)
    # A bare file name has no directory part, and os.makedirs('') raises.
    if directory:
        # exist_ok tolerates the directory being created concurrently.
        os.makedirs(directory, exist_ok=True)
    # The context manager closes the file even when write() fails.
    with open(fn, "w") as newfile:
        newfile.write(data)
class PARSER(PARSEOBJECT):
    """Parser whose instances receive a sequential ``id``."""

    # Class-level counters shared by all PARSER instances.
    _ids = count(0)
    _passes = count(0)

    def __init__(self):
        """Initialise the inherited state, then the PARSER-specific fields."""
        # Run the base initialiser so the inherited attributes (tokenize,
        # parseline, parsefile) exist on every PARSER instance.
        super().__init__()
        self.id = next(self._ids)
        self.tupline = []
        self.tupfile = []
        self.passes = next(self._passes)