Skip to content

Commit af02104

Browse files
authored
add ocr.py
1 parent 737abaf commit af02104

1 file changed

Lines changed: 86 additions & 0 deletions

File tree

src/main/ocr.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import pytesseract
2+
import unicodedata
3+
import re
4+
import numpy as np
5+
6+
7+
8+
# Module-level constants

# Canonical key order used to flatten a coordinates dict into the
# flat [left, top, right, bottom] list returned in each prediction's "box".
bounding_box_order = ["left", "top", "right", "bottom"]
11+
12+
# This method will take the model bounding box predictions and return the extracted text inside each box
def one_shot_ocr_service(image, output):
    """Run OCR on each predicted bounding box and return per-box text.

    Parameters:
        image: PIL.Image to crop regions from.
        output: dict containing a 'bounding-boxes' list; each entry holds a
            'coordinates' dict with 'left'/'top'/'right'/'bottom' values.

    Returns:
        list of dicts, one per box that yielded non-empty text, each with:
            'text'  - cleaned extracted text,
            'box'   - [left, top, right, bottom] coordinates,
            'score' - mean tesseract confidence scaled to 0..1.
    """
    response = []

    # iterate over detections directly instead of indexing by range
    for detection in output['bounding-boxes']:
        # crop image for every detection
        coordinates = detection["coordinates"]
        cropped = image.crop((
            float(coordinates["left"]),
            float(coordinates["top"]),
            float(coordinates["right"]),
            float(coordinates["bottom"]),
        ))

        # convert image to grayscale for better accuracy
        processed_img = cropped.convert('L')

        # extract text with positive confidence from cropped image
        df = pytesseract.image_to_data(processed_img, output_type='data.frame')
        valid_df = df[df["conf"] > 0]
        extracted_text = " ".join(valid_df["text"].values)

        # process text: fold to ASCII, drop quote/noise characters, collapse dots
        extracted_text = str(
            unicodedata.normalize('NFKD', extracted_text).encode('ascii', 'ignore').decode()
        ).strip().replace("\n", " ").replace("...", ".").replace("..", ".").replace(
            '”', ' ').replace('“', ' ').replace("'", ' ').replace('\"', '').replace("alt/1m", "").strip()
        # raw string so the character-class escapes are taken literally by re
        extracted_text = re.sub(
            r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
        extracted_text = " ".join(extracted_text.split())

        # wrap each prediction inside a dictionary; skip boxes with no text
        # (truthiness check replaces the buggy `len(...) is not 0` identity test)
        if extracted_text:
            prediction = {
                "text": extracted_text,
                "box": [coordinates[el] for el in bounding_box_order],
                # mean tesseract confidence (0..100) normalized to 0..1
                "score": valid_df["conf"].mean() / 100.0,
            }
            response.append(prediction)

    return response
51+
52+
# This method will take an image and return the extracted text from the image
def ocr_service(image):
    """Run OCR over a whole image and return one prediction for all text.

    Parameters:
        image: PIL.Image to read text from.

    Returns:
        A single-element list [{'text', 'box', 'score'}], or an empty list
        when tesseract finds no words with positive confidence.
    """
    # convert image to grayscale for better accuracy
    processed_img = image.convert('L')

    # Get data including boxes, confidences, line and page numbers
    df = pytesseract.image_to_data(processed_img, output_type='data.frame')
    valid_df = df[df["conf"] > 0]

    # Guard: without any confident words the index math below would raise
    # IndexError on index[0]; return no predictions instead of crashing.
    if valid_df.empty:
        return []

    # process text: fold to ASCII, drop quote/noise characters, collapse dots
    extracted_text = " ".join(valid_df["text"].values)
    extracted_text = str(
        unicodedata.normalize('NFKD', extracted_text).encode('ascii', 'ignore').decode()
    ).strip().replace("\n", " ").replace("...", ".").replace("..", ".").replace(
        '”', ' ').replace('“', ' ').replace("'", ' ').replace('\"', '').replace("alt/1m", "").strip()
    # raw string so the character-class escapes are taken literally by re
    extracted_text = re.sub(
        r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
    extracted_text = " ".join(extracted_text.split())

    # calculate the bounding box data based on pytesseract results
    # NOTE(review): this takes left/top from the FIRST word and right/bottom
    # from the LAST word — only a true bounding box for single-line,
    # left-to-right text; verify against callers for multi-line input.
    index = valid_df.index.values
    coordinates = {
        "left": valid_df.loc[index[0], "left"],
        "top": valid_df.loc[index[0], "top"],
        "bottom": valid_df.loc[index[-1], "top"] + valid_df.loc[index[-1], "height"],
        "right": valid_df.loc[index[-1], "left"] + valid_df.loc[index[-1], "width"],
    }
    # .item() converts numpy scalars to plain Python numbers (JSON-safe)
    bounding_box = [coordinates[el].item() for el in bounding_box_order]

    # wrap the prediction inside a dictionary
    response = {
        "text": extracted_text,
        "box": bounding_box,
        # mean tesseract confidence (0..100) normalized to 0..1
        "score": valid_df["conf"].mean() / 100.0,
    }
    return [response]

0 commit comments

Comments
 (0)