forked from Freely-Given-org/BibleOrgSys
-
Notifications
You must be signed in to change notification settings - Fork 0
/
USFMFilenames.py
executable file
·634 lines (566 loc) · 38.6 KB
/
USFMFilenames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# USFMFilenames.py
#
# Module handling USFM Bible filenames
#
# Copyright (C) 2010-2018 Robert Hunt
# Author: Robert Hunt <[email protected]>
# License: See gpl-3.0.txt
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Module for creating and manipulating USFM filenames.
"""
from gettext import gettext as _
LastModifiedDate = '2018-01-18' # by RJH
ShortProgName = "USFMFilenames"
ProgName = "USFM Bible filenames handler"
ProgVersion = '0.68'
ProgNameVersion = '{} v{}'.format( ShortProgName, ProgVersion )
ProgNameVersionDate = '{} {} {}'.format( ProgNameVersion, _("last modified"), LastModifiedDate )
debuggingThisModule = False
import os, logging
import BibleOrgSysGlobals
def exp( messageString ):
"""
Expands the message string in debug mode.
Prepends the module name to a error or warning message string
if we are in debug mode.
Returns the new string.
"""
try: nameBit, errorBit = messageString.split( ': ', 1 )
except ValueError: nameBit, errorBit = '', messageString
if BibleOrgSysGlobals.debugFlag or debuggingThisModule:
nameBit = '{}{}{}'.format( ShortProgName, '.' if nameBit else '', nameBit )
return '{}{}'.format( nameBit+': ' if nameBit else '', errorBit )
# end of exp
# The filenames produced by the Bibledit program seem to have a .usfm extension (Info below is from gtk/src/bookdata.cpp 2012-07-11)
BIBLEDIT_FILENAMES = ( '1_Genesis', '2_Exodus', '3_Leviticus', '4_Numbers', '5_Deuteronomy', '6_Joshua', '7_Judges', '8_Ruth', '9_1_Samuel', '10_2_Samuel',
'11_1_Kings', '12_2_Kings', '13_1_Chronicles', '14_2_Chronicles', '15_Ezra', '16_Nehemiah', '17_Esther', '18_Job', '19_Psalms', '20_Proverbs', '21_Ecclesiastes',
'22_Song_of_Solomon', '23_Isaiah', '24_Jeremiah', '25_Lamentations', '26_Ezekiel', '27_Daniel', '28_Hosea', '29_Joel', '30_Amos', '31_Obadiah', '32_Jonah',
'33_Micah', '34_Nahum', '35_Habakkuk', '36_Zephaniah', '37_Haggai', '38_Zechariah', '39_Malachi',
'40_Matthew', '41_Mark', '42_Luke', '43_John', '44_Acts', '45_Romans', '46_1_Corinthians', '47_2_Corinthians', '48_Galatians', '49_Ephesians', '50_Philippians',
'51_Colossians', '52_1_Thessalonians', '53_2_Thessalonians', '54_1_Timothy', '55_2_Timothy', '56_Titus', '57_Philemon',
'58_Hebrews', '59_James', '60_1_Peter', '61_2_Peter', '62_1_John', '63_2_John', '64_3_John', '65_Jude', '66_Revelation',
'67_Front_Matter', '68_Back_Matter', '69_Other_Material', '70_Tobit', '71_Judith', '72_Esther_(Greek)', '73_Wisdom_of_Solomon', '74_Sirach', '75_Baruch',
'76_Letter_of_Jeremiah', '77_Song_of_the_Three_Children', '78_Susanna', '79_Bel_and_the_Dragon', '80_1_Maccabees', '81_2_Maccabees',
'82_1_Esdras', '83_Prayer_of_Manasses', '84_Psalm_151', '85_3_Maccabees', '86_2_Esdras', '87_4_Maccabees', '88_Daniel_(Greek)' )
ALTERNATE_FILENAMES = ( '01-Genesis', '02-Exodus', '03-Leviticus', '04-Numbers', '05-Deuteronomy', '06-Joshua', '07-Judges', '08-Ruth', '09-1 Samuel', '10-2 Samuel',
'11-1 Kings', '12-2 Kings', '13-1 Chronicles', '14-2 Chronicles', '15-Ezra', '16-Nehemiah', '17-Esther', '18-Job', '19-Psalms', '20-Proverbs', '21-Ecclesiastes',
'22-Song-of-Solomon', '23-Isaiah', '24-Jeremiah', '25-Lamentations', '26-Ezekiel', '27-Daniel', '28-Hosea', '29-Joel', '30-Amos', '31-Obadiah', '32-Jonah',
'33-Micah', '34-Nahum', '35-Habakkuk', '36-Zephaniah', '37-Haggai', '38-Zechariah', '39-Malachi',
'40-Matthew', '41-Mark', '42-Luke', '43-John', '44-Acts', '45-Romans', '46-1 Corinthians', '47-2 Corinthians', '48-Galatians', '49-Ephesians', '50-Philippians',
'51-Colossians', '52-1 Thessalonians', '53-2 Thessalonians', '54-1 Timothy', '55-2 Timothy', '56-Titus', '57-Philemon',
'58-Hebrews', '59-James', '60-1 Peter', '61-2 Peter', '62-1 John', '63-2 John', '64-3 John', '65-Jude', '66-Revelation',
#'67-Front-Matter', '68-Back-Matter', '69-Other-Material', '70-Tobit', '71-Judith', '72-Esther-(Greek)', '73-Wisdom-of-Solomon', '74-Sirach', '75-Baruch',
#'76-Letter-of-Jeremiah', '77-Song-of-the-Three-Children', '78-Susanna', '79-Bel-and-the-Dragon', '80-1 Maccabees', '81-2 Maccabees',
#'82-1 Esdras', '83-Prayer-of-Manasses', '84-Psalm-151', '85 3-Maccabees', '86-2 Esdras', '87-4 Maccabees', '88-Daniel-(Greek)'
)
# All of the following must be all UPPER CASE
FILENAMES_TO_IGNORE = ('AUTOCORRECT.TXT','HYPHENATEDWORDS.TXT','PRINTDRAFTCHANGES.TXT','README.TXT','BOOK_NAMES.TXT',) # Only needs to include names whose extensions are not listed below
FILENAME_ENDINGS_TO_IGNORE = ('.ZIP.GO', '.ZIP.DATA',) # Must begin with a dot
# NOTE: Extensions ending in ~ are also ignored
EXTENSIONS_TO_IGNORE = ( 'ASC', 'BAK', 'BAK2', 'BAK3', 'BAK4', 'BBLX', 'BC', 'CCT', 'CSS', 'DOC', 'DTS', 'HTM','HTML',
'JAR', 'LDS', 'LOG', 'MYBIBLE', 'NT','NTX', 'ODT', 'ONT','ONTX', 'OSIS', 'OT','OTX', 'PDB',
'SAV', 'SAVE', 'STY', 'SSF', 'USX', 'VRS', 'YET', 'XML', 'ZIP', ) # Must be UPPERCASE and NOT begin with a dot
class USFMFilenames:
"""
Class for creating and manipulating USFM filenames.
Always returns lists of USFM filenames in the default rough sequence order from the BibleBooksCodes module.
"""
def __init__( self, givenFolderName ):
"""
Create the object by inspecting files in the given folder.
Creates a self.pattern (Paratext template) for USFM filenames where
nnn = language code (lower case) or NNN = language code (UPPER CASE)
bbb = book code (lower case) or BBB = book code (UPPER CASE)
dd = digits
"""
if BibleOrgSysGlobals.debugFlag and debuggingThisModule: print( "USFMFilenames( {} )".format( givenFolderName ) )
self.givenFolderName = givenFolderName
self.pattern, self.fileExtension = '', ''
self.fileList = [] # A list of all files in our folder (excluding folder names and backup filenames)
self._fileDictionary = {} # The keys are 2-tuples of folder, filename, the values are all valid BBB values
self._BBBDictionary = {} # The keys are valid BBB values, the values are all 2-tuples of folder, filename
# Check that the given folder is readable
if not os.access( self.givenFolderName, os.R_OK ):
logging.critical( _("USFMFilenames: Given {!r} folder is unreadable").format( self.givenFolderName ) )
return
# Get the data tables that we need for proper checking
self._USFMBooksCodes = BibleOrgSysGlobals.BibleBooksCodes.getAllUSFMBooksCodes()
self._USFMBooksCodesUpper = [x.upper() for x in self._USFMBooksCodes]
self._USFMBooksCodeNumberTriples = BibleOrgSysGlobals.BibleBooksCodes.getAllUSFMBooksCodeNumberTriples()
self._BibleditBooksCodeNumberTriples = BibleOrgSysGlobals.BibleBooksCodes.getAllBibleditBooksCodeNumberTriples()
# Find how many files are in our folder
self.lastTupleList = None
for possibleFilename in os.listdir( self.givenFolderName ):
pFUpper = possibleFilename.upper()
if pFUpper in FILENAMES_TO_IGNORE: continue
pFUpperProper, pFUpperExt = os.path.splitext( pFUpper )
ignore = False
for ending in FILENAME_ENDINGS_TO_IGNORE:
if pFUpper.endswith( ending): ignore=True; break
if ignore: continue
if pFUpper[-1]!='~' and not pFUpperExt[1:] in EXTENSIONS_TO_IGNORE: # Compare without the first dot
filepath = os.path.join( self.givenFolderName, possibleFilename )
if os.path.isfile( filepath ): # It's a file not a folder
self.fileList.append( possibleFilename )
#print( "fL", self.fileList )
#if not self.fileList: logging.error( _("No files at all in given folder: {!r}").format( self.givenFolderName) ); return
# See if we can find a pattern for these filenames
matched = False
for foundFilename in self.fileList:
foundFileBit, foundExtBit = os.path.splitext( foundFilename )
foundLength = len( foundFileBit )
#print( foundFileBit, foundExtBit )
matched = False
if '_' in foundFileBit and foundExtBit and foundExtBit[0]=='.': # Check for possible Bibledit filenames first
for USFMBookCode,BibleditDigits,BBB in self._BibleditBooksCodeNumberTriples:
BibleditSignature = BibleditDigits + '_'
if BibleditSignature in foundFileBit and foundFileBit in BIBLEDIT_FILENAMES and foundExtBit == '.usfm':
digitsIndex = foundFileBit.index( BibleditSignature )
if digitsIndex == 0:
self.languageIndex = None
self.languageCode = None
self.digitsIndex = digitsIndex
self.USFMBookCodeIndex = None
self.pattern = "Dd_BEName"
self.fileExtension = foundExtBit[1:]
matched = True
break
elif '-' in foundFileBit and foundExtBit and foundExtBit[0]=='.': # Check for possible Bibledit filenames first
for USFMBookCode,BibleditDigits,BBB in self._BibleditBooksCodeNumberTriples:
if foundFileBit in ALTERNATE_FILENAMES and foundExtBit == '.usfm':
if foundFilename[0:2].isdigit:
self.languageIndex = None
self.languageCode = None
self.digitsIndex = 0
self.USFMBookCodeIndex = None
self.pattern = "dd-OEBName"
self.fileExtension = foundExtBit[1:]
matched = True
break
if matched: break
if matched: break
# Didn't find a Bibledit filename -- maybe it's a Paratext style or some kind of freestyle
containsDigits = False
for char in foundFileBit:
if char.isdigit():
containsDigits = True
break
if containsDigits and foundExtBit and foundExtBit[0]=='.':
for USFMBookCode,USFMDigits,BBB in self._USFMBooksCodeNumberTriples:
if USFMDigits in foundFileBit and (USFMBookCode in foundFileBit or USFMBookCode.upper() in foundFileBit):
digitsIndex = foundFileBit.index( USFMDigits )
USFMBookCodeIndex = foundFileBit.index(USFMBookCode) if USFMBookCode in foundFileBit else foundFileBit.index(USFMBookCode.upper())
USFMBookCode = foundFileBit[USFMBookCodeIndex:USFMBookCodeIndex+3]
#print( digitsIndex, USFMBookCodeIndex, USFMBookCode )
if foundLength>=8 and digitsIndex==0 and USFMBookCodeIndex==2: # Found a form like 01GENlanguage.xyz
self.languageIndex = 5
self.languageCode = foundFileBit[self.languageIndex:self.languageIndex+foundLength-5]
self.digitsIndex = digitsIndex
self.USFMBookCodeIndex = USFMBookCodeIndex
self.pattern = "ddbbb" + 'l'*(foundLength-5)
matched = True
elif foundLength==8 and digitsIndex==3 and USFMBookCodeIndex==5: # Found a form like lng01GEN.xyz
self.languageIndex = 0
self.languageCode = foundFileBit[self.languageIndex:self.languageIndex+foundLength-5]
self.digitsIndex = digitsIndex
self.USFMBookCodeIndex = USFMBookCodeIndex
self.pattern = "lllddbbb"
matched = True
else: # we'll try to be more generic
self.languageIndex = None
self.languageCode = None
self.digitsIndex = digitsIndex
self.USFMBookCodeIndex = USFMBookCodeIndex
self.pattern = '*' * foundLength
self.pattern = self.pattern[:digitsIndex] + 'dd' + self.pattern[digitsIndex+2:]
self.pattern = self.pattern[:USFMBookCodeIndex] + 'bbb' + self.pattern[USFMBookCodeIndex+3:]
fillerSize = self.pattern.count( '*' )
fillerIndex = self.pattern.find( '*' )
if fillerIndex!=-1 and fillerSize==1: self.pattern = self.pattern[:fillerIndex] + foundFilename[fillerIndex] + self.pattern[fillerIndex+1:]
if BibleOrgSysGlobals.verbosityLevel > 2: print( "Pattern is {!r}".format( self.pattern ) )
if '*' not in self.pattern: matched = True
else: # we'll try to be even more generic
self.languageIndex = self.digitsIndex = None
self.languageCode = None
self.USFMBookCodeIndex = USFMBookCodeIndex
self.pattern = '*' * foundLength
self.pattern = self.pattern[:USFMBookCodeIndex] + 'bbb' + self.pattern[USFMBookCodeIndex+3:]
if BibleOrgSysGlobals.verbosityLevel > 2: print( "More generic pattern is {!r}".format( self.pattern ) )
matched = True
if matched:
if self.languageCode and self.languageCode.isupper(): self.pattern = self.pattern.replace( 'l', 'L' )
if USFMBookCode.isupper(): self.pattern = self.pattern.replace( 'bbb', 'BBB' )
self.fileExtension = foundExtBit[1:]
break
if matched: break
if matched: break
#if not matched: logging.info( _("Unable to recognize pattern of valid USFM files in ") + self.givenFolderName )
#print( "USFMFilenames: pattern={!r} fileExtension={!r}".format( self.pattern, self.fileExtension ) )
# Also, try looking inside the files
self.getUSFMIDsFromFiles( self.givenFolderName ) # Fill the above dictionaries
#print( "fD", self._fileDictionary )
# end of __init__
def __str__( self ):
"""
This method returns the string representation of an object.
@return: the name of a Bible object formatted as a string
@rtype: string
"""
result = "USFM Filenames object:"
indent = 2
if self.givenFolderName: result += ('\n' if result else '') + ' '*indent + _("Folder: {}").format( self.givenFolderName )
if self.pattern: result += ('\n' if result else '') + ' '*indent + _("Filename pattern: {}").format( self.pattern )
if self.fileExtension: result += ('\n' if result else '') + ' '*indent + _("File extension: {}").format( self.fileExtension )
if self.fileList and BibleOrgSysGlobals.verbosityLevel > 2: result += ('\n' if result else '') + ' '*indent + _("File list: ({}) {}").format( len(self.fileList), self.fileList )
return result
# end of __str___
def __len__( self ):
"""
This method returns the last number of files found.
@return: None (if no search done) or else the last number of USFM files found
@rtype: int
"""
if self.lastTupleList is None: return 0
return len( self.lastTupleList )
# end of __len___
def getUSFMIDFromFile( self, folder, thisFilename, filepath, encoding=None ):
"""
Try to intelligently get the USFMId from the first line in the file (which should be the \\id line).
"""
#print( exp("getUSFMIDFromFile( {} {} {} {} )").format( repr(folder), repr(thisFilename), repr(filepath), encoding ) )
if encoding is None: encoding = 'utf-8'
# Look for the USFM id in the ID line (which should be the first line in a USFM file)
try:
with open( filepath, 'rt', encoding=encoding ) as possibleUSFMFile: # Automatically closes the file when done
lineNumber = 0
for line in possibleUSFMFile:
lineNumber += 1
if line[-1]=='\n': line = line[:-1] # Removing trailing newline character
#print( thisFilename, lineNumber, line )
if line.startswith( '\\id ' ):
if len(line)<5: logging.warning( "id line {!r} in {} is too short".format( line, filepath ) )
idContent = line[4:]
tokens = idContent.split()
#print( "Have id tokens: {}".format( tokens ) )
UCToken0 = tokens[0].upper()
if UCToken0=='I': UCToken0 = '1'
if UCToken0=='II': UCToken0 = '2'
if UCToken0=='III': UCToken0 = '3'
if UCToken0=='IV': UCToken0 = '4'
if UCToken0=='V': UCToken0 = '5'
if UCToken0 in ('1','2','3','4','5',) and len(tokens)>=2: UCToken0 += tokens[1].upper() # Combine something like 1 Sa to 1SA
if UCToken0.startswith( 'JUDG' ): UCToken0 = UCToken0[0] + UCToken0[2:] # Remove the U because it gets confused with JUDE
if len(UCToken0)>2 and UCToken0[1] in ('_','-'): UCToken0 = UCToken0[0] + UCToken0[2:] # Change something like 1_SA to 1SA
if UCToken0 in self._USFMBooksCodesUpper: return UCToken0 # it's a valid one -- we have the most confidence in this one
elif UCToken0[:3] in self._USFMBooksCodesUpper: return UCToken0[:3] # perhaps an abbreviated version is valid (but could think Judges is JUD=Jude)
else: print( "But {!r} wasn't a valid USFM ID in {}!!!".format( UCToken0, thisFilename ) )
break
elif lineNumber == 1:
if line.startswith ( '\\' ):
logging.warning( "First line in {} in {} starts with a backslash but not an id line {!r}".format( thisFilename, folder, line ) )
elif not line:
logging.info( "First line in {} in {} appears to be blank".format( thisFilename, folder ) )
if lineNumber >= 2: break # We only look at the first one or two lines
except UnicodeDecodeError:
if thisFilename != 'usfm-color.sty': # Seems this file isn't UTF-8, but we don't need it here anyway so ignore it
logging.warning( exp("getUSFMIDFromFile: Seems we couldn't decode Unicode in {!r}").format( filepath ) ) # Could be binary or a different encoding
return None
# end of getUSFMIDFromFile
def getUSFMIDsFromFiles( self, givenFolder ):
"""
Go through all the files in the given folder and see how many USFM IDs we can find.
Populates the two dictionaries.
Returns the number of files found.
"""
if BibleOrgSysGlobals.debugFlag and debuggingThisModule: print( exp("getUSFMIDsFromFiles( {} )").format( repr(givenFolder) ) )
# Empty the two dictionaries
self._fileDictionary = {} # The keys are 2-tuples of folder, filename, the values are all valid BBB values
self._BBBDictionary = {} # The keys are valid BBB values, the values are all 2-tuples of folder, filename
folderFilenames = os.listdir( givenFolder )
for possibleFilename in folderFilenames:
pFUpper = possibleFilename.upper()
if pFUpper in FILENAMES_TO_IGNORE: continue
pFUpperProper, pFUpperExt = os.path.splitext( pFUpper )
ignore = False
for ending in FILENAME_ENDINGS_TO_IGNORE:
if pFUpper.endswith( ending): ignore=True; break
if ignore: continue
if pFUpper[-1]!='~' and not pFUpperExt[1:] in EXTENSIONS_TO_IGNORE: # Compare without the first dot
filepath = os.path.join( givenFolder, possibleFilename )
if os.path.isfile( filepath ): # It's a file not a folder
USFMId = self.getUSFMIDFromFile( givenFolder, possibleFilename, filepath )
if USFMId:
assert filepath not in self._fileDictionary
BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFMAbbreviation( USFMId )
self._fileDictionary[(givenFolder,possibleFilename,)] = BBB
if BBB in self._BBBDictionary: logging.error( "{}Oops, already found {!r} in {}, now we have a duplicate in {}".format( 'getUSFMIDsFromFiles: ' if BibleOrgSysGlobals.debugFlag else '', BBB, self._BBBDictionary[BBB], possibleFilename ) )
self._BBBDictionary[BBB] = (givenFolder,possibleFilename,)
if len(self._fileDictionary) != len(self._BBBDictionary):
logging.warning( "getUSFMIDsFromFiles: Oops, something went wrong because dictionaries have {} and {} entries".format( len(self._fileDictionary), len(self._BBBDictionary) ) )
#print( "fD2", self._fileDictionary )
return len(self._fileDictionary)
# end of getUSFMIDsFromFiles
def getFilenameTemplate( self ):
"""
Returns a pattern/template for USFM filenames where
lll = language code (lower case) or LLL = language code (UPPER CASE)
bbb = book code (lower case) or BBB = book code (UPPER CASE)
dd = Paratext digits (can actually include some letters)
"""
return self.pattern
# end of getFilenameTemplate
def getAllFilenames( self ):
"""
Return a list of all filenames in our folder.
This excludes names of subfolders and backup files.
"""
return self.fileList
# end of getAllFilenames
def doListAppend( self, BBB, filename, givenList, caller ):
"""
Check that BBB and filename are not in the givenList,
then add them as a 2-tuple.
If there is a duplicate, remove both (as we're obviously unsure).
"""
removeBBB = removeFilename = None
for existingBBB, existingFilename in givenList:
if existingBBB == BBB:
if BibleOrgSysGlobals.verbosityLevel > 2: logging.warning( "{} tried to add duplicate {} {} when already had {} (removed both)".format( caller, BBB, filename, existingFilename ) )
removeBBB, removeFilename = existingBBB, existingFilename
if existingFilename == filename:
if BibleOrgSysGlobals.verbosityLevel > 2: logging.warning( "{} tried to add duplicate {} {} when already had {} (removed both)".format( caller, filename, BBB, existingBBB ) )
removeBBB, removeFilename = existingBBB, existingFilename
if removeFilename:givenList.remove( (removeBBB,removeFilename,) )
else: givenList.append( (BBB,filename,) )
# end of doListAppend
def getDerivedFilenameTuples( self ):
"""
Return a theoretical list of valid USFM filenames that match our filename template.
The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
Each tuple contains ( BBB, filename ) not including the folder path.
"""
resultList = []
if self.pattern and self.fileExtension.upper() not in EXTENSIONS_TO_IGNORE:
if self.pattern == "Dd_BEName": # they are Bibledit style
for USFMBookCode,BibleditDigits,BBB in self._BibleditBooksCodeNumberTriples:
BibleditSignature = BibleditDigits + '_'
for BEFilename in BIBLEDIT_FILENAMES: # this doesn't seem very efficient, but it does work
if BEFilename.startswith( BibleditSignature ):
resultList.append( (BBB,BEFilename+'.'+self.fileExtension,) )
break
elif self.pattern == "dd-OEBName":
for AltFilename in ALTERNATE_FILENAMES:
BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( AltFilename[0:2] )
resultList.append( (BBB,AltFilename+'.'+self.fileExtension,) )
else: # they are Paratext style
for USFMBookCode,USFMDigits,BBB in self._USFMBooksCodeNumberTriples:
filename = '*' * len(self.pattern)
if self.digitsIndex is not None: filename = filename[:self.digitsIndex] + USFMDigits + filename[self.digitsIndex+len(USFMDigits):]
filename = filename[:self.USFMBookCodeIndex] + ( USFMBookCode.upper() if 'BBB' in self.pattern else USFMBookCode ) + filename[self.USFMBookCodeIndex+len(USFMBookCode):]
if self.languageCode: filename = filename[:self.languageIndex] + self.languageCode + filename[self.languageIndex+len(self.languageCode):]
filename += '.' + self.fileExtension
for ix in range( 0, len(filename)): # See if there's any constant characters in the pattern that we need to grab
if filename[ix]=='*' and self.pattern[ix]!='*':
filename = filename[:ix] + self.pattern[ix] + filename[ix+1:]
self.doListAppend( BBB, filename, resultList, "getDerivedFilenameTuples" )
return BibleOrgSysGlobals.BibleBooksCodes.getSequenceList( resultList )
# end of getDerivedFilenameTuples
def getConfirmedFilenameTuples( self, strictCheck=False ):
"""
Starting with the theoretical list of filenames derived from the deduced template (if we have one),
return a list of tuples of UPPER CASE book codes with actual (present and readable) USFM filenames.
If the strictCheck flag is set, the program also looks at the id lines inside the files.
The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
Each tuple contains ( BBB, filename ) not including the folder path.
"""
resultList = []
for BBB,derivedFilename in self.getDerivedFilenameTuples():
derivedFilepath = os.path.join( self.givenFolderName, derivedFilename )
if BibleOrgSysGlobals.debugFlag and debuggingThisModule: print( ' getConfirmedFilenameTuples: Checking for existence of: ' + derivedFilename )
if os.access( derivedFilepath, os.R_OK ):
if strictCheck:
USFMId = self.getUSFMIDFromFile( self.givenFolderName, derivedFilename, derivedFilepath )
if USFMId is None:
logging.error( "{}internal USFM Id missing for {} in {}".format( 'getConfirmedFilenameTuples: ' if BibleOrgSysGlobals.debugFlag else '', BBB, derivedFilename ) )
continue # so it doesn't get added
BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFMAbbreviation( USFMId )
if BBB != BBB:
logging.error( "{}Internal USFM Id ({}{}) doesn't match {} for {}".format( 'getConfirmedFilenameTuples: ' if BibleOrgSysGlobals.debugFlag else '', USFMId, '' if BBB==USFMId else " -> {}".format(BBB), BBB, derivedFilename ) )
continue # so it doesn't get added
self.doListAppend( BBB, derivedFilename, resultList, "getConfirmedFilenameTuples" )
self.lastTupleList = resultList
return resultList # No need to sort these because the derived ones are sorted
# end of USFMFilenames.getConfirmedFilenameTuples
def getPossibleFilenameTuplesExt( self ):
"""
Return a list of filename tuples just derived from the list of files in the folder,
i.e., look only externally at the filenames.
The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
Each tuple contains ( BBB, filename ) not including the folder path.
"""
resultList = []
for possibleFilename in self.fileList:
pFUpper = possibleFilename.upper()
if pFUpper in FILENAMES_TO_IGNORE: continue
pFUpperProper, pFUpperExt = os.path.splitext( pFUpper )
for USFMBookCode,USFMDigits,BBB in self._USFMBooksCodeNumberTriples:
ignore = False
for ending in FILENAME_ENDINGS_TO_IGNORE:
if pFUpper.endswith( ending): ignore=True; break
if ignore: continue
if USFMBookCode.upper() in pFUpperProper:
if pFUpper[-1]!='~' and not pFUpperExt[1:] in EXTENSIONS_TO_IGNORE: # Compare without the first dot
self.doListAppend( BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFMAbbreviation( USFMBookCode ), possibleFilename, resultList, "getPossibleFilenameTuplesExt" )
self.lastTupleList = resultList
return BibleOrgSysGlobals.BibleBooksCodes.getSequenceList( resultList )
# end of USFMFilenames.getPossibleFilenameTuplesExt
def getPossibleFilenameTuplesInt( self ):
"""
Return a list of filename tuples which contain book codes internally on the \\id line.
The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
Each tuple contains ( BBB, filename ) not including the folder path.
"""
resultList = []
if len( self._BBBDictionary) >= len( self._fileDictionary ): # Choose the longest one
for BBB in self._BBBDictionary.keys():
self.doListAppend( BBB, self._BBBDictionary[BBB][1], resultList, "getPossibleFilenameTuplesInt1" )
else:
for folder,filename in self._fileDictionary.keys():
assert folder == self.givenFolderName
#print( "getPossibleFilenameTuplesInt", folder, filename, self._fileDictionary )
self.doListAppend( self._fileDictionary[(folder,filename,)], filename, resultList, "getPossibleFilenameTuplesInt2" )
self.lastTupleList = resultList
return BibleOrgSysGlobals.BibleBooksCodes.getSequenceList( resultList )
# end of USFMFilenames.getPossibleFilenameTuplesInt
def getMaximumPossibleFilenameTuples( self, strictCheck=False ):
"""
Find the method that finds the maximum number of USFM Bible files.
The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
Each tuple contains ( BBB, filename ) not including the folder path.
"""
#if BibleOrgSysGlobals.debugFlag: print( "getMaximumPossibleFilenameTuples( {} )".format( strictCheck ) )
resultString, resultList = 'Confirmed', self.getConfirmedFilenameTuples()
resultListExt = self.getPossibleFilenameTuplesExt()
if len(resultListExt) > len(resultList):
resultString, resultList = 'External', resultListExt
resultListInt = self.getPossibleFilenameTuplesInt()
if len(resultListInt) > len(resultList):
resultString, resultList = 'Internal', resultListInt
if BibleOrgSysGlobals.verbosityLevel > 2: print( "getMaximumPossibleFilenameTuples: using {}".format( resultString ) )
if strictCheck or BibleOrgSysGlobals.strictCheckingFlag:
#if BibleOrgSysGlobals.debugFlag: print( " getMaximumPossibleFilenameTuples doing strictCheck…" )
for BBB,filename in resultList[:]:
firstLine = BibleOrgSysGlobals.peekIntoFile( filename, self.givenFolderName )
#print( 'UFN', repr(firstLine) )
if firstLine is None: resultList.remove( (BBB,filename) ); continue # seems we couldn't decode the file
if firstLine and firstLine[0]==chr(65279): #U+FEFF or \ufeff
logging.info( "USFMBibleFileCheck: Detected Unicode Byte Order Marker (BOM) in {}".format( filename ) )
firstLine = firstLine[1:] # Remove the Unicode Byte Order Marker (BOM)
if not firstLine or firstLine[0] != '\\': # don't allow a blank first line and must start with a backslash
resultList.remove( (BBB,filename) )
self.lastTupleList = resultList
#print( "getMaximumPossibleFilenameTuples is returning", resultList )
return resultList # No need to sort these, coz all the above calls produce sorted results
# end of USFMFilenames.getMaximumPossibleFilenameTuples
def getUnusedFilenames( self ):
"""
Return a list of filenames which didn't seem to be USFM files.
NOTE: This list depends on which "find" routine above was run last!
The order of the filenames in the list has no meaning.
"""
folderFilenames = os.listdir( self.givenFolderName )
if self.lastTupleList is None: return None # Not sure what list they're after here
#print( len(self.lastTupleList), self.lastTupleList )
for BBB,actualFilename in self.lastTupleList:
#print( BBB, actualFilename )
if actualFilename in folderFilenames: folderFilenames.remove( actualFilename ) # Sometimes it can be removed already if we had (invalid) duplicates in the lastTupleList
return folderFilenames
# end of getUnusedFilenames
def getSSFFilenames( self, searchAbove=False, auto=True ):
"""
Return a list of full pathnames of .ssf files in the folder.
NOTE: USFM projects don't usually have the .ssf files in the project folder,
but 'backed-up' projects often do.
If searchAbove is set to True and no ssf files are found in the given folder,
this routine will attempt to search the next folder up the file hierarchy.
Furthermore, unless auto is set to False,
it will try to find the correct one from multiple SSFs.
"""
def getSSFFilenamesHelper( folder ):
resultPathlist = []
files = os.listdir( folder )
for foundFilename in files:
if not foundFilename.endswith('~'): # Ignore backup files
foundFileBit, foundExtBit = os.path.splitext( foundFilename )
if foundExtBit.lower()=='.ssf':
resultPathlist.append( os.path.join( folder, foundFilename ) )
return resultPathlist
# end of getSSFFilenamesHelper
filelist = getSSFFilenamesHelper( self.givenFolderName )
if not filelist and searchAbove: # try the next level up
filelist = getSSFFilenamesHelper( os.path.join( self.givenFolderName, '../' ) )
if auto and len(filelist)>1: # See if we can help them by automatically choosing the right one
count, index = 0, -1
for j, filepath in enumerate(filelist): # Check if we can find a single matching ssf file
foundPathBit, foundExtBit = os.path.splitext( filepath )
foundPathBit, foundFileBit = os.path.split( foundPathBit )
#print( foundPathBit, foundFileBit, foundExtBit, self.givenFolderName )
if foundFileBit in self.givenFolderName: index = j; count += 1 # Take a guess that this might be the right one
#print( count, index )
if count==1 and index!=-1: filelist = [ filelist[index] ] # Found exactly one so reduce the list down to this one filepath
return filelist
# end of getSSFFilenames
# end of class USFMFiles
def demo():
""" Demonstrate finding files in some USFM Bible folders. """
if BibleOrgSysGlobals.verbosityLevel > 0: print( ProgNameVersion )
# These are relative paths -- you can replace these with your test folder(s)
testFolders = ("Tests/DataFilesForTests/USFMTest1/", "Tests/DataFilesForTests/USFMTest2/",
'Tests/DataFilesForTests/USXTest1/', 'Tests/DataFilesForTests/USXTest2/',
'Tests/DataFilesForTests/USFM-WEB/', 'Tests/DataFilesForTests/USFM-OEB/',
"../../../../AutoProcesses/Processed/","../../../../AutoProcesses/Processed/Test/",
'Tests/DataFilesForTests/USFMErrorProject/',
)
for j, testFolder in enumerate( testFolders ):
print( '\n{}'.format( j+1 ) )
if os.access( testFolder, os.R_OK ):
UFns = USFMFilenames( testFolder )
print( UFns )
result = UFns.getAllFilenames(); print( "\nAll:", len(result), result )
result = UFns.getDerivedFilenameTuples(); print( "\nDerived:", UFns.getFilenameTemplate(), len(result), result )
result = UFns.getConfirmedFilenameTuples(); print( "\nConfirmed:", UFns.getFilenameTemplate(), len(result), result )
result = UFns.getUnusedFilenames(); print( "Unused:", len(result), result )
result = UFns.getConfirmedFilenameTuples( strictCheck=True ); print( "\nConfirmed (with double check):", UFns.getFilenameTemplate(), len(result), result )
result = UFns.getUnusedFilenames(); print( "Unused:", len(result), result )
result = UFns.getPossibleFilenameTuplesExt(); print( "\nPossibleExt:", len(result), result )
result = UFns.getUnusedFilenames(); print( "Unused:", len(result), result )
result = UFns.getPossibleFilenameTuplesInt(); print( "\nPossibleInt:", len(result), result )
result = UFns.getUnusedFilenames(); print( "Unused:", len(result), result )
result = UFns.getMaximumPossibleFilenameTuples(); print( "\nMaxPoss:", len(result), result )
result = UFns.getMaximumPossibleFilenameTuples( strictCheck=True ); print( "\nMaxPoss (strict):", len(result), result )
result = UFns.getUnusedFilenames(); print( "Unused:", len(result), result )
result = UFns.getSSFFilenames(); print( "\nSSF:", len(result), result )
else: print( "Sorry, test folder {!r} doesn't exist on this computer.".format( testFolder ) )
if __name__ == '__main__':
# Configure basic set-up
parser = BibleOrgSysGlobals.setup( ProgName, ProgVersion )
BibleOrgSysGlobals.addStandardOptionsAndProcess( parser )
demo()
BibleOrgSysGlobals.closedown( ProgName, ProgVersion )
# end of USFMFilenames.py