-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathclean_gadget.py
126 lines (110 loc) · 7.34 KB
/
clean_gadget.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
# Identifiers that clean_gadget must leave untouched because they are C/C++
# reserved words rather than user-defined names.  The original author describes
# the list as covering keywords up to C11 and C++17.  Immutable on purpose.
keywords = frozenset((
    # double-underscore spellings (compiler-specific extensions)
    '__asm', '__builtin', '__cdecl', '__declspec', '__except',
    '__export', '__far16', '__far32', '__fastcall', '__finally',
    '__import', '__inline', '__int16', '__int32', '__int64',
    '__int8', '__leave', '__optlink', '__packed', '__pascal',
    '__stdcall', '__system', '__thread', '__try', '__unaligned',
    # single-underscore spellings of the same extensions
    '_asm', '_Builtin', '_Cdecl', '_declspec', '_except',
    '_Export', '_Far16', '_Far32', '_Fastcall', '_finally',
    '_Import', '_inline', '_int16', '_int32', '_int64',
    '_int8', '_leave', '_Optlink', '_Packed', '_Pascal',
    '_stdcall', '_System', '_try',
    # plain keywords and alternative operator tokens
    'alignas', 'alignof', 'and', 'and_eq', 'asm', 'auto',
    'bitand', 'bitor', 'bool', 'break', 'case', 'catch',
    'char', 'char16_t', 'char32_t', 'class', 'compl', 'const',
    'const_cast', 'constexpr', 'continue', 'decltype', 'default', 'delete',
    'do', 'double', 'dynamic_cast', 'else', 'enum', 'explicit',
    'export', 'extern', 'false', 'final', 'float', 'for',
    'friend', 'goto', 'if', 'inline', 'int', 'long',
    'mutable', 'namespace', 'new', 'noexcept', 'not', 'not_eq',
    'nullptr', 'operator', 'or', 'or_eq', 'override', 'private',
    'protected', 'public', 'register', 'reinterpret_cast', 'return', 'short',
    'signed', 'sizeof', 'static', 'static_assert', 'static_cast', 'struct',
    'switch', 'template', 'this', 'thread_local', 'throw', 'true',
    'try', 'typedef', 'typeid', 'typename', 'union', 'unsigned',
    'using', 'virtual', 'void', 'volatile', 'wchar_t', 'while',
    'xor', 'xor_eq',
    # a macro rather than a keyword, but it is protected in the same way
    'NULL',
))
# Function names that are known not to be user-defined; clean_gadget keeps
# them as they are.  Immutable set.
main_set = frozenset(('main',))
# Names of the arguments of main(); clean_gadget keeps them as they are.
# Immutable set.
main_args = frozenset(('argc', 'argv'))
# Patterns used by clean_gadget, compiled once at import time.
# End of a multi-line comment: "*/" as the last non-blank text of the line.
_RX_COMMENT_END = re.compile(r'\*/\s*$')
# Function-name candidate: an identifier followed (after optional blanks) by "(".
_RX_FUN = re.compile(r'\b([_A-Za-z]\w*)\b(?=\s*\()')
# Variable-name candidate: an identifier that is not followed by "(" and that
# is either followed by a word which is itself directly followed by "(", or is
# not followed by blanks plus another word at all.
_RX_VAR = re.compile(r'\b([_A-Za-z]\w*)\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()')
# Shortest double-quoted / single-quoted run on a line.
_RX_STRING_LITERAL = re.compile(r'".*?"')
_RX_CHAR_LITERAL = re.compile(r"'.*?'")
# Any character outside the 7-bit ASCII range.
_RX_NON_ASCII = re.compile(r'[^\x00-\x7f]')


def _symbol_replacer(symbols, prefix, reserved):
    """Build a re.sub callback that swaps identifiers for numbered symbols.

    symbols  -- dict filled in place; maps original name -> prefix + number
    prefix   -- text put in front of the running number ('FUN' or 'VAR')
    reserved -- set of names that are returned unchanged and never numbered

    The number is len(symbols) + 1 at the moment a name is first seen, so
    symbols are numbered 1, 2, 3, ... in order of first appearance.
    """
    def replace(match):
        name = match.group(1)
        if name in reserved:
            return name
        if name not in symbols:
            symbols[name] = prefix + str(len(symbols) + 1)
        return symbols[name]
    return replace


def clean_gadget(gadget):
    """Replace user-defined identifiers in a code gadget by generic symbols.

    gadget is a list (any iterable works) of source-line strings.  For each
    line, in order:

    * a line whose last non-blank text is "*/" is dropped from the output;
    * the contents of string and character literals are removed, the quotes
      are kept;
    * non-ASCII characters are removed;
    * every function-name candidate that is neither in main_set nor in
      keywords becomes FUN1, FUN2, ...;
    * every variable-name candidate that is neither in keywords nor in
      main_args becomes VAR1, VAR2, ...

    The name -> symbol tables are shared by all lines of one call, so a name
    gets the same symbol everywhere in the gadget; every call starts with
    empty tables.

    Example: ['function(File file, Buffer buff)', 'this is a comment test */']
    yields ['FUN1(File VAR1, Buffer VAR2)'].

    Returns a new list with the cleaned lines; the input is not modified.
    """
    # One table per symbol kind; the callbacks below fill them as names appear.
    fun_symbols = {}
    var_symbols = {}
    rename_fun = _symbol_replacer(fun_symbols, 'FUN', keywords | main_set)
    rename_var = _symbol_replacer(var_symbols, 'VAR', keywords | main_args)

    # final cleaned gadget output to return to interface
    cleaned_gadget = []
    for line in gadget:
        # skip the closing line of a multi-line comment entirely
        if _RX_COMMENT_END.search(line) is not None:
            continue
        # remove all string literals (keep the quotes), then all character
        # literals, then anything that is not ASCII
        code_line = _RX_STRING_LITERAL.sub('""', line)
        code_line = _RX_CHAR_LITERAL.sub("''", code_line)
        code_line = _RX_NON_ASCII.sub('', code_line)
        # Each kind is substituted in a single left-to-right pass.  Unlike a
        # per-name re.sub loop this never re-visits a FUNn/VARn symbol that was
        # just inserted, so a source identifier that happens to be spelled
        # like a symbol cannot cause a second, wrong substitution.
        # Functions go first: the lookahead for "(" still holds afterwards, so
        # the variable pass cannot touch the inserted FUNn symbols.
        code_line = _RX_FUN.sub(rename_fun, code_line)
        code_line = _RX_VAR.sub(rename_var, code_line)
        cleaned_gadget.append(code_line)
    # return the list of cleaned lines
    return cleaned_gadget
if __name__ == '__main__':
    # Ad-hoc smoke test: print the cleaned form of a few sample gadgets.
    # Every sample goes through its own clean_gadget call.
    sample_gadgets = (
        [
            '231 151712/shm_setup.c inputfunc 11',
            'int main(int argc, char **argv) {',
            'while ((c = getopt(argc, argv, "k:s:m:o:h")) != -1) {',
            'switch(c) {',
        ],
        [
            '278 151587/ffmpeg.c inputfunc 3159',
            'int main(int argc,char **argv)',
            'parse_loglevel(argc,argv,options);',
            'if (argc > 1 && !strcmp(argv[1],"-d")) {',
            'argc--;',
            'argv++;',
            'show_banner(argc,argv,options);',
            'ret = ffmpeg_parse_options(argc,argv);',
            'if (ret < 0) {',
        ],
        [
            'invalid_memory_access_012_s_001 *s;',
            's = (invalid_memory_access_012_s_001 *)calloc(1,sizeof(invalid_memory_access_012_s_001));',
            's->a = 20;',
            's->b = 20;',
            's->uninit = 20;',
            'free(s);]',
        ],
        # the second line ends with "*/"
        [
            'function(File file, Buffer buff)',
            'this is a comment test */',
        ],
    )
    for sample in sample_gadgets:
        print(clean_gadget(sample))
    # unrelated check: whitespace-splitting of an already tokenised statement
    print('printf ( " " , variable ++ )'.split())