Browse Source

Add parallel processing

Runs in parallel, using as many worker processes as there are CPU cores on the computer.
Each process handles a single image file.
main
Ashton Charbonneau 2 years ago
parent
commit
484b68cd11
1 changed files with 108 additions and 50 deletions
  1. 108
    50
      rok-reader.py

+ 108
- 50
rok-reader.py View File

1
 import argparse
1
 import argparse
2
 import glob
2
 import glob
3
 import os
3
 import os
4
+import multiprocessing
4
 from PIL import Image, ImageFilter
5
 from PIL import Image, ImageFilter
5
 from PIL.ImageOps import autocontrast, invert, grayscale, contain
6
 from PIL.ImageOps import autocontrast, invert, grayscale, contain
6
 import pytesseract
7
 import pytesseract
8
+import signal
9
+import sys
7
 
10
 
8
 # ----
11
 # ----
9
 # Program
12
 # Program
85
 # Functions
88
 # Functions
86
 # ----
89
 # ----
87
 
90
 
91
+# Read an image file
92
def read_file(fileTuple):
    """Scrape every configured text region from a single screenshot.

    Intended to run in a multiprocessing worker, one call per image file.

    Args:
        fileTuple: ``(fileNumber, file, isDuplicate)`` - the index of the file
            in the work list, the path of the image, and whether the file was
            found in previously scraped output.

    Returns:
        ``(fileNumber, filename, debugOutput, outputPath, outputLine,
        isDuplicate)``. ``debugOutput`` and ``outputLine`` are empty strings
        when the file is a duplicate and debug mode is off; the image is not
        opened in that case.

    Raises:
        ValueError: if the file name contains none of "profile", "more" or
            "kills".
    """
    fileNumber, file, isDuplicate = fileTuple
    filename = os.path.basename(file)

    # The screenshot type is encoded in the file name.
    if "profile" in filename:
        targets = PROFILE_TARGETS
        outputPath = OUTPUT_PATH_PROFILE
    elif "more" in filename:
        targets = MOREINFO_TARGETS
        outputPath = OUTPUT_PATH_MOREINFO
    elif "kills" in filename:
        targets = KILLS_TARGETS
        outputPath = OUTPUT_PATH_KILLS
    else:
        # Raise an ordinary exception instead of calling sys.exit():
        # SystemExit is not an Exception subclass, so a pool worker would die
        # without reporting a result and the parent would wait on it forever.
        raise ValueError("File name doesn't contain type: " + filename)

    # Duplicates are only re-read in debug mode.
    if isDuplicate and not arguments.debug:
        return (fileNumber, filename, "", outputPath, "", isDuplicate)

    # Open the image and flatten it onto a white RGB background. Converting to
    # RGBA first guarantees an alpha band exists even for RGB/palette images.
    with Image.open(file) as image:
        rgbaImage = image.convert("RGBA")
    rgbImage = Image.new("RGB", rgbaImage.size, (255, 255, 255))
    rgbImage.paste(rgbaImage, mask=rgbaImage.split()[3])

    # Get data
    # target[0] is the label; target[1] and target[2] are passed to Box as two
    # coordinate pairs (presumably top-left and bottom-right - confirm against
    # Box); target[3:6] map to is_number, inv and bonusRightTrim.
    strings = []
    debugLines = []
    for i, target in enumerate(targets):
        debugFile = os.path.splitext(filename)[0] + "_" + str(i) + ".png"
        box = Box(target[1][0], target[1][1], target[2][0], target[2][1])
        string = read_string_from_image(rgbImage, box, target[3], target[4], target[5], debugFolder + debugFile)
        strings.append(string)
        debugLines.append("  " + target[0] + ": " + string + "\n")

    # One tab-separated row: file name first, then one column per target.
    outputLine = filename + "\t" + "\t".join(strings)
    debugOutput = "".join(debugLines)
    return (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
132
+
88
 # Read text from a section of an image using Tesseract
133
 # Read text from a section of an image using Tesseract
89
def read_string_from_image(rgbImage, box, is_number, inv, bonusRightTrim, debugFilePath):
    """Read the text inside one rectangular region of an image with Tesseract.

    Args:
        rgbImage: the already-loaded image to read from (it is not modified;
            every step works on a derived copy).
        box: object exposing ``x``, ``y``, ``x2`` and ``y2``, used as the crop
            rectangle.
        is_number: when true, OCR is restricted to digits and commas and all
            separators/whitespace are removed from the result.
        inv: when true, the cropped region is colour-inverted before filtering.
        bonusRightTrim: value added to the right edge of the detected content
            bounding box before the second crop.
        debugFilePath: where the pre-processed region is saved when
            ``arguments.debug`` is set.

    Returns:
        The recognised text as a single-line string.
    """
    # Crop to correct dimensions
    region = rgbImage.crop((box.x, box.y, box.x2, box.y2))

    # Invert if flagged
    if inv:
        region = invert(region)

    # Apply filters
    region = autocontrast(grayscale(region), cutoff=(0, 75))

    # Crop to content
    contentBox = autocontrast(invert(region), cutoff=(0, 90)).getbbox()
    if contentBox:
        left, top, right, bottom = contentBox
        region = region.crop((left, top, right + bonusRightTrim, bottom))

    # Resize and sharpen
    region = contain(region, (800, 800), method=1)
    for sharpenFilter in (ImageFilter.EDGE_ENHANCE_MORE, ImageFilter.SHARPEN):
        region = region.filter(sharpenFilter)

    if arguments.debug:
        region.save(debugFilePath)

    if is_number:
        rawText = pytesseract.image_to_string(region, config="--psm 6 -c tessedit_char_whitelist=0123456789,")
        # Numbers keep digits only: drop line breaks, separators and spaces.
        return rawText.strip().translate(str.maketrans("", "", "\n\r., \t"))

    rawText = pytesseract.image_to_string(region, config="--psm 6")
    # Collapse the text onto one line: newlines and tabs become spaces,
    # carriage returns are dropped.
    return rawText.strip().translate(str.maketrans({"\n": " ", "\r": None, "\t": " "}))
161
+
162
+# Write to output file
163
def write_file():
    """Placeholder for writing to the output file; currently does nothing."""
    return None
165
+
166
+# Initialize child processes (ignore SIGINT)
167
def mpInitializer():
    """Make the calling process ignore SIGINT.

    Passed as the pool ``initializer`` so that it runs in each child process.
    """
    ignoreHandler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignoreHandler)
117
 
169
 
118
 # ----
170
 # ----
119
 # Arguments
171
 # Arguments
131
 # Program
183
 # Program
132
 # ----
184
 # ----
133
 
185
 
186
+# TODO: remove globals
187
# Folder that receives the pre-processed debug images. Kept at module level
# because read_file() reads it from the worker processes.
debugFolder = "debug" + "/"

if __name__ == '__main__':
    # Create project folder
    projectFolder = "output/" + arguments.project + "/"
    os.makedirs(projectFolder, exist_ok=True)

    # Create debug folder
    if arguments.debug:
        os.makedirs(debugFolder, exist_ok=True)

    # Get files to read
    screenshots_to_read = glob.glob(arguments.file, recursive=True)
    screenshot_count = len(screenshots_to_read)
    if screenshot_count < 1: sys.exit("No files found.")

    # Collect the file names that were already scraped. Every output row
    # starts with the file name followed by a tab, so only the first column is
    # compared; a substring search over the whole text could match a name
    # inside another name or inside OCR text.
    alreadyScraped = set()
    for outputPath in [OUTPUT_PATH_PROFILE, OUTPUT_PATH_MOREINFO, OUTPUT_PATH_KILLS]:
        try:
            with open(projectFolder + outputPath, "r", newline='', encoding='utf-8') as outputFile:
                for line in outputFile:
                    alreadyScraped.add(line.split("\t", 1)[0])
        except FileNotFoundError:
            # A missing output file (e.g. first run of a project) simply
            # means nothing of that type has been scraped yet.
            continue

    # Mark as duplicates: each work item is (fileNumber, file, isDuplicate)
    screenshots_to_read = [
        (i, file, os.path.basename(file) in alreadyScraped)
        for i, file in enumerate(screenshots_to_read)
    ]

    # Scrape
    if arguments.verbose: print("Scraping", screenshot_count, "files")

    # One worker process per CPU core; workers ignore SIGINT so that Ctrl+C is
    # handled here in the parent only.
    cpuCount = multiprocessing.cpu_count()
    mpPool = multiprocessing.Pool(cpuCount, initializer=mpInitializer)

    try:
        # imap yields results in submission order, so output rows keep the
        # order of the file list.
        for result in mpPool.imap(read_file, screenshots_to_read):
            fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate = result
            if arguments.verbose: print(fileNumber+1, "/", screenshot_count, ": ", filename, sep="")
            if isDuplicate and not arguments.debug:
                if arguments.verbose: print("  ", "already scraped.")
            else:
                if arguments.verbose: print(debugOutput)
                # Debug mode only produces debug images; nothing is recorded.
                if not arguments.debug:
                    # Append per result so finished rows survive an interrupt.
                    with open(projectFolder + outputPath, "a+", newline='', encoding='utf-8') as outputFile:
                        outputFile.write(outputLine + "\n")
    except KeyboardInterrupt:
        print("Exiting...")
    finally:
        # Runs on success, interrupt and error alike.
        mpPool.terminate()
        mpPool.join()

Loading…
Cancel
Save