|
@@ -1,9 +1,12 @@
|
1
|
1
|
import argparse
|
2
|
2
|
import glob
|
3
|
3
|
import os
|
|
4
|
+import multiprocessing
|
4
|
5
|
from PIL import Image, ImageFilter
|
5
|
6
|
from PIL.ImageOps import autocontrast, invert, grayscale, contain
|
6
|
7
|
import pytesseract
|
|
8
|
+import signal
|
|
9
|
+import sys
|
7
|
10
|
|
8
|
11
|
# ----
|
9
|
12
|
# Program
|
|
@@ -85,35 +88,84 @@ KILLS_TARGETS = [
|
85
|
88
|
# Functions
|
86
|
89
|
# ----
|
87
|
90
|
|
|
91
|
+# Read an image file
|
|
92
|
+def read_file(fileTuple):
|
|
93
|
+ fileNumber = fileTuple[0]
|
|
94
|
+ file = fileTuple[1]
|
|
95
|
+ isDuplicate = fileTuple[2]
|
|
96
|
+
|
|
97
|
+ filename = os.path.basename(file)
|
|
98
|
+
|
|
99
|
+ if "profile" in filename:
|
|
100
|
+ targets = PROFILE_TARGETS
|
|
101
|
+ outputPath = OUTPUT_PATH_PROFILE
|
|
102
|
+ elif "more" in filename:
|
|
103
|
+ targets = MOREINFO_TARGETS
|
|
104
|
+ outputPath = OUTPUT_PATH_MOREINFO
|
|
105
|
+ elif "kills" in filename:
|
|
106
|
+ targets = KILLS_TARGETS
|
|
107
|
+ outputPath = OUTPUT_PATH_KILLS
|
|
108
|
+ else:
|
|
109
|
+ sys.exit("File name doesn't contain type") # TODO: fix
|
|
110
|
+
|
|
111
|
+ if not isDuplicate or arguments.debug:
|
|
112
|
+ # Open image and swap to RGB
|
|
113
|
+ image = Image.open(file)
|
|
114
|
+ rgbImage = Image.new("RGB", image.size, (255, 255, 255))
|
|
115
|
+ rgbImage.paste(image, mask = image.split()[3])
|
|
116
|
+ image.close()
|
|
117
|
+
|
|
118
|
+ # Get data
|
|
119
|
+ outputLine = filename + "\t"
|
|
120
|
+ debugOutput = ""
|
|
121
|
+ for i, target in enumerate(targets):
|
|
122
|
+ debugFile = os.path.splitext(filename)[0] + "_" + str(i) + ".png"
|
|
123
|
+
|
|
124
|
+ string = read_string_from_image(rgbImage, Box(target[1][0], target[1][1], target[2][0], target[2][1]), target[3], target[4], target[5], debugFolder + debugFile)
|
|
125
|
+ debugOutput = debugOutput + " " + target[0] + ": " + string + "\n"
|
|
126
|
+ if i:
|
|
127
|
+ outputLine = outputLine + "\t"
|
|
128
|
+ outputLine = outputLine + string
|
|
129
|
+ return (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
|
|
130
|
+ else:
|
|
131
|
+ return (fileNumber, filename, "", outputPath, "", isDuplicate)
|
|
132
|
+
|
88
|
133
|
# Read text from a section of an image using Tesseract
|
89
|
|
-def read_string_from_image(file, box, is_number, inv, bonusRightTrim, debugFilePath):
|
90
|
|
- with Image.open(file) as image:
|
|
134
|
+def read_string_from_image(rgbImage, box, is_number, inv, bonusRightTrim, debugFilePath):
|
91
|
135
|
# Crop to correct dimensions
|
92
|
|
- image = image.crop((box.x, box.y, box.x2, box.y2))
|
93
|
|
-
|
94
|
|
- # Switch to RGB mode
|
95
|
|
- rgbimage = Image.new("RGB", image.size, (255, 255, 255))
|
96
|
|
- rgbimage.paste(image, mask = image.split()[3])
|
|
136
|
+ rgbImage = rgbImage.crop((box.x, box.y, box.x2, box.y2))
|
97
|
137
|
|
98
|
138
|
# Invert if flagged
|
99
|
|
- if inv: rgbimage = invert(rgbimage)
|
|
139
|
+ if inv: rgbImage = invert(rgbImage)
|
100
|
140
|
|
101
|
141
|
# Apply filters
|
102
|
|
- rgbimage = grayscale(rgbimage)
|
103
|
|
- rgbimage = autocontrast(rgbimage, cutoff=(0, 75))
|
104
|
|
- bbox = autocontrast(invert(rgbimage), cutoff=(0, 90)).getbbox()
|
105
|
|
- if bbox: rgbimage = rgbimage.crop((bbox[0], bbox[1], bbox[2] + bonusRightTrim, bbox[3]))
|
106
|
|
- rgbimage = contain(rgbimage, (800, 800), method=1)
|
107
|
|
- rgbimage = rgbimage.filter(ImageFilter.EDGE_ENHANCE_MORE)
|
108
|
|
- rgbimage = rgbimage.filter(ImageFilter.SHARPEN)
|
|
142
|
+ rgbImage = grayscale(rgbImage)
|
|
143
|
+ rgbImage = autocontrast(rgbImage, cutoff=(0, 75))
|
|
144
|
+
|
|
145
|
+ # Crop to content
|
|
146
|
+ bbox = autocontrast(invert(rgbImage), cutoff=(0, 90)).getbbox()
|
|
147
|
+ if bbox: rgbImage = rgbImage.crop((bbox[0], bbox[1], bbox[2] + bonusRightTrim, bbox[3]))
|
|
148
|
+
|
|
149
|
+ # Resize and sharpen
|
|
150
|
+ rgbImage = contain(rgbImage, (800, 800), method=1)
|
|
151
|
+ rgbImage = rgbImage.filter(ImageFilter.EDGE_ENHANCE_MORE)
|
|
152
|
+ rgbImage = rgbImage.filter(ImageFilter.SHARPEN)
|
109
|
153
|
|
110
|
154
|
if arguments.debug:
|
111
|
|
- rgbimage.save(debugFilePath)
|
|
155
|
+ rgbImage.save(debugFilePath)
|
112
|
156
|
|
113
|
157
|
if is_number:
|
114
|
|
- return pytesseract.image_to_string(rgbimage, config="--psm 6 -c tessedit_char_whitelist=0123456789,").strip().replace('\n', ' ').replace('\r', '').replace('.', '').replace(',', '').replace('\t', ' ').replace(' ', '')
|
|
158
|
+ return pytesseract.image_to_string(rgbImage, config="--psm 6 -c tessedit_char_whitelist=0123456789,").strip().replace('\n', ' ').replace('\r', '').replace('.', '').replace(',', '').replace('\t', ' ').replace(' ', '')
|
115
|
159
|
else:
|
116
|
|
- return pytesseract.image_to_string(rgbimage, config="--psm 6").strip().replace('\n', ' ').replace('\r', '').replace('\t', ' ')
|
|
160
|
+ return pytesseract.image_to_string(rgbImage, config="--psm 6").strip().replace('\n', ' ').replace('\r', '').replace('\t', ' ')
|
|
161
|
+
|
|
162
|
+# Write to output file
|
|
163
|
+def write_file():
|
|
164
|
+ return
|
|
165
|
+
|
|
166
|
+# Initialize child processes (ignore SIGINT)
|
|
167
|
+def mpInitializer():
|
|
168
|
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
|
117
|
169
|
|
118
|
170
|
# ----
|
119
|
171
|
# Arguments
|
|
@@ -131,52 +183,58 @@ arguments = parser.parse_args()
|
131
|
183
|
# Program
|
132
|
184
|
# ----
|
133
|
185
|
|
|
186
|
+# TODO: remove globals
|
|
187
|
+debugFolder = "debug" + "/"
|
|
188
|
+
|
134
|
189
|
if __name__ == '__main__':
|
135
|
190
|
# Create project folder
|
136
|
191
|
projectFolder = "output/" + arguments.project + "/"
|
137
|
192
|
if not os.path.exists(projectFolder):
|
138
|
193
|
os.makedirs(projectFolder)
|
139
|
194
|
|
140
|
|
- debugFolder = "debug" + "/"
|
|
195
|
+ # Create debug folder
|
141
|
196
|
if arguments.debug:
|
142
|
197
|
if not os.path.exists(debugFolder):
|
143
|
198
|
os.makedirs(debugFolder)
|
144
|
199
|
|
145
|
|
- # Get files
|
|
200
|
+ # Get files to read
|
146
|
201
|
screenshots_to_read = glob.glob(arguments.file, recursive=True)
|
147
|
202
|
screenshot_count = len(screenshots_to_read)
|
148
|
203
|
if screenshot_count < 1: sys.exit("No files found.")
|
149
|
204
|
|
150
|
|
- # Scrape
|
151
|
|
- if arguments.verbose: print("Scraping", screenshot_count, "files")
|
|
205
|
+ # Get all previously scraped data # TODO: limit to filenames only
|
|
206
|
+ alreadyScraped = ""
|
|
207
|
+ for outputPath in [OUTPUT_PATH_PROFILE, OUTPUT_PATH_MOREINFO, OUTPUT_PATH_KILLS]:
|
|
208
|
+ with open(projectFolder + outputPath, "r+", newline='', encoding='utf-8') as outputFile:
|
|
209
|
+ alreadyScraped = alreadyScraped + outputFile.read()
|
|
210
|
+
|
|
211
|
+ # Mark as duplicates
|
152
|
212
|
for i, file in enumerate(screenshots_to_read):
|
153
|
|
- filename = os.path.basename(file)
|
154
|
|
-
|
155
|
|
- if arguments.verbose: print(i+1, "/", screenshot_count, ": ", filename, sep="")
|
156
|
|
-
|
157
|
|
- if "profile" in filename:
|
158
|
|
- targets = PROFILE_TARGETS
|
159
|
|
- output = OUTPUT_PATH_PROFILE
|
160
|
|
- elif "more" in filename:
|
161
|
|
- targets = MOREINFO_TARGETS
|
162
|
|
- output = OUTPUT_PATH_MOREINFO
|
163
|
|
- elif "kills" in filename:
|
164
|
|
- targets = KILLS_TARGETS
|
165
|
|
- output = OUTPUT_PATH_KILLS
|
|
213
|
+ if os.path.basename(file) in alreadyScraped:
|
|
214
|
+ screenshots_to_read[i] = (i, file, True)
|
166
|
215
|
else:
|
167
|
|
- sys.exit("File name doesn't contain type")
|
168
|
|
-
|
169
|
|
- # TODO: bad time complexity
|
170
|
|
- exists = False
|
171
|
|
- with open(projectFolder + output, "a+", newline='', encoding='utf-8') as output_file:
|
172
|
|
- if filename not in output_file.read():
|
173
|
|
- if not arguments.debug: output_file.write(filename + "\t")
|
174
|
|
- for i, target in enumerate(targets):
|
175
|
|
- debugFile = os.path.splitext(filename)[0] + "_" + str(i) + ".png"
|
176
|
|
- string = read_string_from_image(file, Box(target[1][0], target[1][1], target[2][0], target[2][1]), target[3], target[4], target[5], debugFolder + debugFile)
|
177
|
|
- if arguments.verbose: print(" ", target[0], ": ", string, sep="")
|
178
|
|
- if i and not arguments.debug: output_file.write("\t")
|
179
|
|
- if not arguments.debug: output_file.write(string)
|
180
|
|
- if not arguments.debug: output_file.write("\n")
|
181
|
|
- else:
|
|
216
|
+ screenshots_to_read[i] = (i, file, False)
|
|
217
|
+
|
|
218
|
+ # Scrape
|
|
219
|
+ if arguments.verbose: print("Scraping", screenshot_count, "files")
|
|
220
|
+
|
|
221
|
+ cpuCount = multiprocessing.cpu_count()
|
|
222
|
+ mpPool = multiprocessing.Pool(cpuCount, initializer=mpInitializer)
|
|
223
|
+
|
|
224
|
+ try:
|
|
225
|
+ # Returns: (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
|
|
226
|
+ for result in mpPool.imap(read_file, screenshots_to_read):
|
|
227
|
+ if arguments.verbose: print(result[0]+1, "/", screenshot_count, ": ", result[1], sep="")
|
|
228
|
+ if result[5] and not arguments.debug:
|
182
|
229
|
if arguments.verbose: print(" ", "already scraped.")
|
|
230
|
+ else:
|
|
231
|
+ if arguments.verbose: print(result[2])
|
|
232
|
+ if not arguments.debug:
|
|
233
|
+ with open(projectFolder + result[3], "a+", newline='', encoding='utf-8') as outputFile:
|
|
234
|
+ outputFile.write(result[4] + "\n")
|
|
235
|
+ except KeyboardInterrupt:
|
|
236
|
+ print("Exiting...")
|
|
237
|
+ mpPool.terminate()
|
|
238
|
+ finally:
|
|
239
|
+ mpPool.terminate()
|
|
240
|
+ mpPool.join()
|