-
Notifications
You must be signed in to change notification settings - Fork 66
/
pan_card_detect.py
194 lines (159 loc) · 4.93 KB
/
pan_card_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import os.path
import json
import io
import sys
import string
import pytesseract
import re
import difflib
import csv
import dateutil.parser as dparser
# Pillow is mandatory: the script cannot load or binarise the card image without it.
try:
    from PIL import Image, ImageEnhance, ImageFilter
except ImportError:
    # Only a missing installation is handled here; any other error should surface.
    print("Please Install PIL - For Python 3 Users the Library is now called Pillow")
    # Non-zero status so callers and shell scripts can detect the failure.
    sys.exit(1)
# Load the image named on the command line and binarise it before OCR.
if len(sys.argv) < 2:
    # Exit with a usage hint (status 1) instead of an IndexError traceback.
    sys.exit("Usage: python {} <path-to-card-image>".format(sys.argv[0]))
path = sys.argv[1]
img = Image.open(path)
img = img.convert('RGB') #RGBA not supported or required in Python 3 onwards
pix = img.load()
width, height = img.size
for y in range(height):
    for x in range(width):
        # A pixel becomes black as soon as any of its channels is below 102;
        # every other pixel becomes white.
        if min(pix[x, y]) < 102:
            pix[x, y] = (0, 0, 0, 255)
        else:
            pix[x, y] = (255, 255, 255, 255)
# The binarised copy is what gets handed to the OCR step.
img.save('temp.jpg')
# Run OCR on the binarised image; the context manager releases the image file
# as soon as the text has been extracted.
with Image.open('temp.jpg') as binarized:
    text_in = pytesseract.image_to_string(binarized)
print(text_in)
# Keep a copy of the raw OCR output on disk. The encoding is explicit because
# OCR output can contain characters the platform's default encoding cannot
# represent.
with io.open('outputbase.txt', 'w', encoding='utf8') as text_output:
    text_output.write(text_in)
# Read the text back from the file; the rest of the script works on `text`.
with io.open('outputbase.txt', 'r', encoding='utf8') as text_input:
    text = text_input.read()
#print(text)
# Extracted fields; each stays None until a later step finds a value.
name = fname = dob = pan = None
# Working lists filled in by the extraction steps below.
nameline, dobline, panline = [], [], []
text0, text1, text2 = [], [], []

# Break the OCR text into lines, trim surrounding whitespace from each one
# and keep only the lines that still contain something.
lines = text.split('\n')
text1 = [trimmed for trimmed in (raw.strip() for raw in lines) if trimmed]
#print(text1)
# List Object Returned in the following order
'''
Note: Hindi has some of the highest error rates in Tesseract, so the Hindi text on the card shows up as noise in the
OCR output. Tesseract does not work well with noisy data.
Reference: https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35248.pdf
1. Income Tax Department Government of India (this text may be distorted because of the image quality, or because
Tesseract OCR cannot cleanly separate languages it supports less well, such as Hindi, from the English text around
them.)
2. Name of the PAN Card Holder
3. Father's Name
4. Date of Birth in MM/DD/YYYY format as listed in the PAN Card
5. ----Permanent Account Number---- text that is a named entity in the PAN Card (not the actual PAN Card Number)
6. Permanent Account Number in the format ABCDE1234F
7. Signature as normal text - named entity in the PAN Card
'''
# Find the first line that ends with one of the header keywords (or a
# truncated form of one). If none matches, lineno stays 0, so the first
# line is treated as the header.
header_pattern = re.compile('(INCOME|TAX|GOW|GOVT|GOVERNMENT|OVERNMENT|VERNMENT|DEPARTMENT|EPARTMENT|PARTMENT|ARTMENT|INDIA|NDIA)$')
lineno = 0
for position, wordline in enumerate(text1):
    if header_pattern.search(wordline):
        lineno = position
        break
# Everything after the header line is a candidate for the card's fields.
text0 = text1[lineno + 1:]
#print(text0) #Contains all the relevant extracted text in form of a list - uncomment to check
#-----------Read Database
# Flatten every cell of namedb.csv into one list of candidate names. A single
# pass avoids the repeated list copying that sum(rows, []) performs.
with open('namedb.csv', 'r') as f:
    reader = csv.reader(f)
    newlist = [cell for row in reader for cell in row]
# Searching for Name and finding closest name in database
# A line is kept as a name line when at least one of its words, upper-cased,
# has a close match in the name database. Each line is added at most once.
for candidate in text0:
    if any(difflib.get_close_matches(word.upper(), newlist)
           for word in candidate.split()):
        nameline.append(candidate)
# Each field is looked up independently so that one missing value does not
# stop the others from being filled in.
# The first matched name line is taken as the holder, the second as the father.
if nameline:
    name = nameline[0]
if len(nameline) > 1:
    fname = nameline[1]
# The PAN is taken from the fifth line after the header.
if len(text0) > 4:
    pan = text0[4]
def _looks_like_date(token):
    """Return True when dateutil can parse *token* as a date."""
    try:
        dparser.parse(token, fuzzy=True)
    except (ValueError, OverflowError):
        # Not a date: keep scanning instead of aborting the whole search.
        return False
    return True


# Lines that were not recognised as names hold the date of birth and the PAN.
dobline = [item for item in text0 if item not in nameline]
for position, line in enumerate(dobline):
    # Only tokens longer than three characters are tried as dates.
    found = next(
        (token for token in line.split()
         if len(token) > 3 and _looks_like_date(token)),
        None,
    )
    if found is not None:
        dob = found
        # The lines after the date of birth are kept as PAN candidates.
        panline = dobline[position + 1:]
        # Stop at the first date so a later line cannot overwrite it.
        break
# NOTE: the string literal below is disabled code, not a docstring; it is never
# executed. It sketches an alternative PAN lookup that scans `panline` for the
# "Permanent Account Number" label and takes the entry that follows it.
'''try:
for wordline in panline:
# panline now contains the two objects as string, which will be converted to a list using split
xx = wordline.split() # Splits the final two objects of the actual PAN Card number & Signature Entity
if ([w for w in xx if re.search('(Number|umber|Account|ccount|count|Permanent|ermanent|manent)$', w)]):
pan = panline[panline.index(wordline)+1]
break
pan = pan.replace(" ", "")
except:
pass'''
# Collect the extracted fields under the keys used in the JSON output.
data = {
    'Name': name,
    'Father Name': fname,
    'Date of Birth': dob,
    'PAN': pan,
}
#print(data)
# Writing data into JSON
# `unicode` only exists on Python 2; fall back to `str` elsewhere.
try:
    to_unicode = unicode
except NameError:
    to_unicode = str
# Write JSON file
with io.open('data.json', 'w', encoding='utf8') as outfile:
    str_ = json.dumps(data,
                      indent=4, sort_keys=True,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))
# Read JSON file
# Read with the same encoding it was written in: ensure_ascii=False lets
# non-ASCII characters into the file, which the default encoding may not decode.
with io.open('data.json', encoding='utf8') as data_file:
    data_loaded = json.load(data_file)
#print(data == data_loaded)
# Removing dummy files
os.remove('temp.jpg')
# Reading data back JSON(give correct path where JSON is stored)
# data.json is written as UTF-8, so it is decoded as UTF-8 here too.
with io.open('data.json', 'r', encoding='utf8') as f:
    ndata = json.load(f)
# Print the four fields as a framed table, one divider between each pair.
border = "|+++++++++++++++++++++++++++++++|"
divider = "|-------------------------------|"
print('\t', border)
for position, key in enumerate(('Name', 'Father Name', 'Date of Birth', 'PAN')):
    if position:
        print('\t', divider)
    print('\t', '|', '\t', ndata[key])
print('\t', border)