#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
"""
Utilities to analyze text. Files are the input.
Once a file is read, its output is unicode text lines.
All internal processing assumes unicode in and out.
"""

import io
import json
import os
import re
import unicodedata

import chardet
import typecode

from textcode import pdf
from textcode import markup
from textcode import sfdb
from textcode import strings

# Tracing flags
TRACE = False or os.environ.get('SCANCODE_DEBUG_TEXT_ANALYSIS', False)


def logger_debug(*args):
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))

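# Usage sketch: tracing is switched on from the environment before a run,
# e.g. (the scan target path below is hypothetical):
#
#   SCANCODE_DEBUG_TEXT_ANALYSIS=1 scancode --license some/dir
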
def numbered_text_lines(
    location,
    demarkup=False,
    plain_text=False,
    start_line=1,
):
    """
    Yield tuples of (line number, text line) from the file at ``location``.
    Return an empty iterator if no text content is extractable. Text
    extraction is based on the detected file type. Long lines are broken down
    into chunks, therefore two items can have the same line number.

    Line numbers start at ``start_line``, which is 1 by default.

    If ``demarkup`` is True, attempt to detect if a file contains HTML/XML-like
    markup and clean up this markup.

    If ``plain_text`` is True, treat the file as a plain text file and do not
    attempt to detect its type and extract its content with special
    procedures. This is used mostly when loading license texts and rules.

    Note: for testing or building from strings, ``location`` can be a list of
    unicode line strings.
    """
    if not location:
        return iter([])

    if not isinstance(location, str):
        # not a path: wrap an iterator on location which should be a sequence
        # of lines
        if TRACE:
            logger_debug('numbered_text_lines:', 'location is not a file')
        return enumerate(iter(location), start_line)

    if plain_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'plain_text')
        return enumerate(unicode_text_lines(location), start_line)

    T = typecode.get_type(location)

    if TRACE:
        logger_debug('numbered_text_lines: T.filetype_file:', T.filetype_file)
        logger_debug('numbered_text_lines: T.is_text_with_long_lines:', T.is_text_with_long_lines)
        logger_debug('numbered_text_lines: T.is_binary:', T.is_binary)

    # TODO: we should have a command line option to force digging inside binaries
    if not T.contains_text:
        return iter([])

    # Should we read this as some markup, PDF, office doc, text or binary?
    if T.is_pdf and T.is_pdf_with_text:
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_pdf')
        return enumerate(unicode_text_lines_from_pdf(location), start_line)

    if T.filetype_file.startswith('Spline Font Database'):
        if TRACE:
            logger_debug('numbered_text_lines:', 'Spline Font Database')
        return enumerate(
            (as_unicode(l) for l in sfdb.get_text_lines(location)),
            start_line,
        )

    # lightweight markup stripping support
    if demarkup and markup.is_markup(location):
        try:
            numbered_lines = list(enumerate(markup.demarkup(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'demarkup')
            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines demarkup:', 'break_numbered_unicode_text_lines')
            return numbered_lines
        except Exception:
            # on failure, fall through and treat the file as plain text
            pass

    if T.is_js_map:
        try:
            numbered_lines = list(enumerate(js_map_sources_lines(location), start_line))
            if TRACE:
                logger_debug('numbered_text_lines:', 'js_map')
            return numbered_lines
        except Exception:
            # on failure, fall through and treat the file as plain text
            pass

    if T.is_text:
        lines = unicode_text_lines(location=location, decrlf=is_source(location))
        numbered_lines = enumerate(lines, start_line)
        # text with very long lines such as minified JS, JS map files or large JSON
        if (
            not location.endswith('package.json')
            and (
                T.is_text_with_long_lines or T.is_compact_js
                or T.filetype_file == 'data' or 'locale' in location
            )
        ):
            numbered_lines = break_numbered_unicode_text_lines(numbered_lines)
            if TRACE:
                logger_debug('numbered_text_lines:', 'break_numbered_unicode_text_lines')
        return numbered_lines

    # TODO: handle Office-like documents, RTF, etc
    # if T.is_doc:
    #     return unicode_text_lines_from_doc(location)

    # TODO: add support for "wide" UTF-16-like strings where each char is
    # followed by a zero as is often found in some Windows binaries. Do this
    # for binaries only. This may conflict with "strings" extraction as
    # currently implemented.
    if T.is_binary:
        # fall back to binary strings extraction
        if TRACE:
            logger_debug('numbered_text_lines:', 'is_binary')
        return enumerate(unicode_text_lines_from_binary(location), start_line)

    return iter([])

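# A minimal usage sketch (the file path is hypothetical):
#
#   for line_number, text in numbered_text_lines('src/foo.c'):
#       ...
#
# For tests, ``location`` may also be a list of unicode lines:
#
#   list(numbered_text_lines(['line one\n', 'line two\n']))
#   # -> [(1, 'line one\n'), (2, 'line two\n')]
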
def unicode_text_lines_from_binary(location):
    """
    Return an iterable over unicode text lines extracted from a binary file
    at location.
    """
    T = typecode.get_type(location)
    if T.contains_text:
        for line in strings.strings_from_file(location):
            yield remove_verbatim_cr_lf_tab_chars(line)

def unicode_text_lines_from_pdf(location):
    """
    Return an iterable over unicode text lines extracted from a PDF file at
    location.
    """
    for line in pdf.get_text_lines(location):
        yield as_unicode(line)

def break_numbered_unicode_text_lines(
    numbered_lines,
    split=u'([",\'])',
    max_len=200,
    chunk_len=30,
):
    """
    Yield text lines breaking long lines on `split` where numbered_lines is
    an iterator of (line number, line text).
    """
    splitter = re.compile(split).split
    for line_number, line in numbered_lines:
        if len(line) > max_len:
            # split then reassemble in more reasonable chunks
            splitted = splitter(line)
            chunks = (splitted[i:i + chunk_len] for i in range(0, len(splitted), chunk_len))
            for chunk in chunks:
                full_chunk = u''.join(chunk)
                if full_chunk:
                    yield line_number, full_chunk
        else:
            yield line_number, line

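# For instance, a single 600-character minified-JS-like line splits on quotes
# and commas and is re-yielded as several chunks that share the same line
# number (illustrative sketch):
#
#   numbered = iter([(1, '"a",' * 150)])
#   chunks = list(break_numbered_unicode_text_lines(numbered))
#   # every resulting item is (1, <chunk of the original line>)
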
def js_map_sources_lines(location):
    """
    Yield unicode text lines from the js.map or css.map file at `location`.
    Spec is at:
    https://docs.google.com/document/d/1U1RGAehQwRypUTovF1KRlpiOFze0b-_2gc6fAH0KY0k/edit
    The format is:
        {
            "version": 3,
            "file": "out.js",
            "sourceRoot": "",
            "sources": ["foo.js", "bar.js"],
            "sourcesContent": [null, null],
            "names": ["src", "maps", "are", "fun"],
            "mappings": "A,AAAB;;ABCDE;"
        }
    We care only about the presence of these tags for detection: version,
    sources, sourcesContent.
    """
    with io.open(location, encoding='utf-8') as jsm:
        content = json.load(jsm)
    sources = content.get('sourcesContent', [])
    for entry in sources:
        if not entry:
            # entries can be null, as in the spec example above
            continue
        entry = replace_verbatim_cr_lf_chars(entry)
        for line in entry.splitlines():
            yield remove_verbatim_cr_lf_tab_chars(line)

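# Usage sketch (the map file path is hypothetical): given a map file whose
# "sourcesContent" entries hold the original sources as JSON strings,
#
#   for line in js_map_sources_lines('bundle.js.map'):
#       ...
#
# each source line is yielded with escaped CR/LF sequences normalized away.
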
def as_unicode(line):
    """
    Return a unicode text line from a text line.
    Try to decode line as Unicode. Try first some default encodings,
    then attempt Unicode trans-literation and finally fall back to ASCII
    strings extraction.

    TODO: Add file/magic detection, UnicodeDammit/BS3/4
    """
    if isinstance(line, str):
        return remove_null_bytes(line)
    s = u''
    try:
        s = line.decode('UTF-8')
    except UnicodeDecodeError:
        try:
            # FIXME: latin-1 may never fail
            s = line.decode('LATIN-1')
        except UnicodeDecodeError:
            try:
                # Convert some byte string to ASCII characters as Unicode
                # including replacing accented characters with their
                # non-accented NFKD equivalent. Non ISO-Latin and non ASCII
                # characters are stripped from the output. Does not preserve
                # the original length offsets.
                # For Unicode NFKD equivalence, see:
                # http://en.wikipedia.org/wiki/Unicode_equivalence
                s = unicodedata.normalize('NFKD', line).encode('ASCII')
            except UnicodeDecodeError:
                try:
                    enc = chardet.detect(line)['encoding']
                    s = str(line, enc)
                except UnicodeDecodeError:
                    # fall back to strings extraction if all else fails:
                    # extract from the original bytes, not from the
                    # (still undecoded) ``s``
                    s = strings.string_from_string(line)
    return remove_null_bytes(s)

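# Decoding sketch: UTF-8 is tried first, so valid UTF-8 bytes round-trip
# cleanly, and str input passes through (with null bytes replaced):
#
#   as_unicode(b'caf\xc3\xa9')   # -> 'café'
#   as_unicode('already text')   # -> 'already text'
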
def remove_null_bytes(s):
    """
    Return a string with all null bytes replaced by a space.
    There are some rare cases where we can have binary strings that are not
    caught early when detecting a file type, but only late at the line level.
    This helps catch most of these cases.
    """
    return s.replace('\x00', ' ')

def remove_verbatim_cr_lf_tab_chars(s):
    """
    Return a string with any verbatim but escaped line endings and tabs
    (such as a literal \\r, \\n or \\t) replaced by a space.
    """
    return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')

def replace_verbatim_cr_lf_chars(s):
    """
    Return a string with any verbatim but escaped line endings
    (such as a literal \\r or \\n) replaced by an LF.
    """
    return (s
        .replace('\\\\r\\\\n', '\n')
        .replace('\\r\\n', '\n')
        .replace('\\\\r', '\n')
        .replace('\\\\n', '\n')
        .replace('\\r', '\n')
        .replace('\\n', '\n')
    )

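# For example, an escaped CRLF sequence becomes a single real LF:
#
#   replace_verbatim_cr_lf_chars('a\\r\\nb')   # -> 'a\nb'
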
def unicode_text_lines(location, decrlf=False):
    """
    Yield unicode text lines from a file at ``location`` if it contains text.
    Open the file as binary then try to decode each line as Unicode.
    Remove verbatim, escaped CR, LF and tabs if ``decrlf`` is True.
    """
    lines = _unicode_text_lines(location)
    if decrlf:
        return map(remove_verbatim_cr_lf_tab_chars, lines)
    else:
        return lines

def _unicode_text_lines(location):
    with open(location, 'rb') as f:
        for line in f.read().splitlines(True):
            yield as_unicode(line)

def unicode_text(location, decrlf=False):
    """
    Return a string guaranteed to be unicode from the content of the file at
    location. The whole file content is returned at once, which may be a
    problem for very large files.
    """
    return u' '.join(unicode_text_lines(location, decrlf=decrlf))

def is_source(location):
    """
    Return True if the file at location is source code, based on its file
    extension.
    """
    return location.endswith((
        '.ada',
        '.adb',
        '.asm',
        '.asp',
        '.aj',
        '.bas',
        '.bat',
        '.c',
        '.c++',
        '.cc',
        '.clj',
        '.cob',
        '.cpp',
        '.cs',
        '.csh',
        '.csx',
        '.cxx',
        '.d',
        '.e',
        '.el',
        '.f',
        '.fs',
        '.f77',
        '.f90',
        '.for',
        '.fth',
        '.ftn',
        '.go',
        '.h',
        '.hh',
        '.hpp',
        '.hs',
        '.html',
        '.htm',
        '.hxx',
        '.java',
        '.js',
        '.jsx',
        '.jsp',
        '.ksh',
        '.kt',
        '.lisp',
        '.lua',
        '.m',
        '.m4',
        '.nim',
        '.pas',
        '.php',
        '.pl',
        '.pp',
        '.ps1',
        '.py',
        '.r',
        '.rb',
        '.ruby',
        '.rs',
        '.s',
        '.scala',
        '.sh',
        '.swift',
        '.ts',
        '.vhdl',
        '.verilog',
        '.vb',
        '.groovy',
        '.po',
    ))

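# For example:
#
#   is_source('src/main.c')    # -> True
#   is_source('README.txt')    # -> False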