Bladeren bron

Add parallel processing

Runs in parallel, using as many worker processes as there are CPU cores on the computer.
Each process handles a single image file.
main
Ashton Charbonneau 3 jaren geleden
bovenliggende
commit
484b68cd11
1 gewijzigde bestanden met toevoegingen van 108 en 50 verwijderingen
  1. 108
    50
      rok-reader.py

+ 108
- 50
rok-reader.py Bestand weergeven

@@ -1,9 +1,12 @@
1 1
 import argparse
2 2
 import glob
3 3
 import os
4
+import multiprocessing
4 5
 from PIL import Image, ImageFilter
5 6
 from PIL.ImageOps import autocontrast, invert, grayscale, contain
6 7
 import pytesseract
8
+import signal
9
+import sys
7 10
 
8 11
 # ----
9 12
 # Program
@@ -85,35 +88,84 @@ KILLS_TARGETS = [
85 88
 # Functions
86 89
 # ----
87 90
 
91
+# Read an image file
92
+def read_file(fileTuple):
93
+    fileNumber = fileTuple[0]
94
+    file = fileTuple[1]
95
+    isDuplicate = fileTuple[2]
96
+
97
+    filename = os.path.basename(file)
98
+
99
+    if "profile" in filename:
100
+        targets = PROFILE_TARGETS
101
+        outputPath = OUTPUT_PATH_PROFILE
102
+    elif "more" in filename:
103
+        targets = MOREINFO_TARGETS
104
+        outputPath = OUTPUT_PATH_MOREINFO
105
+    elif "kills" in filename:
106
+        targets = KILLS_TARGETS
107
+        outputPath = OUTPUT_PATH_KILLS
108
+    else:
109
+        sys.exit("File name doesn't contain type") # TODO: fix
110
+
111
+    if not isDuplicate or arguments.debug:
112
+        # Open image and swap to RGB
113
+        image = Image.open(file)
114
+        rgbImage = Image.new("RGB", image.size, (255, 255, 255))
115
+        rgbImage.paste(image, mask = image.split()[3])
116
+        image.close()
117
+
118
+        # Get data
119
+        outputLine = filename + "\t"
120
+        debugOutput = ""
121
+        for i, target in enumerate(targets):
122
+            debugFile = os.path.splitext(filename)[0] + "_" + str(i) + ".png"
123
+
124
+            string = read_string_from_image(rgbImage, Box(target[1][0], target[1][1], target[2][0], target[2][1]), target[3], target[4], target[5], debugFolder + debugFile)
125
+            debugOutput = debugOutput + "  " + target[0] + ": " + string + "\n"
126
+            if i:
127
+                outputLine = outputLine + "\t"
128
+            outputLine = outputLine + string
129
+        return (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
130
+    else:
131
+        return (fileNumber, filename, "", outputPath, "", isDuplicate)
132
+
88 133
 # Read text from a section of an image using Tesseract
89
-def read_string_from_image(file, box, is_number, inv, bonusRightTrim, debugFilePath):
90
-    with Image.open(file) as image:
134
+def read_string_from_image(rgbImage, box, is_number, inv, bonusRightTrim, debugFilePath):
91 135
         # Crop to correct dimensions
92
-        image = image.crop((box.x, box.y, box.x2, box.y2))
93
-
94
-        # Switch to RGB mode
95
-        rgbimage = Image.new("RGB", image.size, (255, 255, 255))
96
-        rgbimage.paste(image, mask = image.split()[3])
136
+        rgbImage = rgbImage.crop((box.x, box.y, box.x2, box.y2))
97 137
 
98 138
         # Invert if flagged
99
-        if inv: rgbimage = invert(rgbimage)
139
+        if inv: rgbImage = invert(rgbImage)
100 140
 
101 141
         # Apply filters
102
-        rgbimage = grayscale(rgbimage)
103
-        rgbimage = autocontrast(rgbimage, cutoff=(0, 75))
104
-        bbox = autocontrast(invert(rgbimage), cutoff=(0, 90)).getbbox()
105
-        if bbox: rgbimage = rgbimage.crop((bbox[0], bbox[1], bbox[2] + bonusRightTrim, bbox[3]))
106
-        rgbimage = contain(rgbimage, (800, 800), method=1)
107
-        rgbimage = rgbimage.filter(ImageFilter.EDGE_ENHANCE_MORE)
108
-        rgbimage = rgbimage.filter(ImageFilter.SHARPEN)
142
+        rgbImage = grayscale(rgbImage)
143
+        rgbImage = autocontrast(rgbImage, cutoff=(0, 75))
144
+
145
+        # Crop to content
146
+        bbox = autocontrast(invert(rgbImage), cutoff=(0, 90)).getbbox()
147
+        if bbox: rgbImage = rgbImage.crop((bbox[0], bbox[1], bbox[2] + bonusRightTrim, bbox[3]))
148
+
149
+        # Resize and sharpen
150
+        rgbImage = contain(rgbImage, (800, 800), method=1)
151
+        rgbImage = rgbImage.filter(ImageFilter.EDGE_ENHANCE_MORE)
152
+        rgbImage = rgbImage.filter(ImageFilter.SHARPEN)
109 153
 
110 154
         if arguments.debug:
111
-            rgbimage.save(debugFilePath)
155
+            rgbImage.save(debugFilePath)
112 156
 
113 157
         if is_number:
114
-            return pytesseract.image_to_string(rgbimage, config="--psm 6 -c tessedit_char_whitelist=0123456789,").strip().replace('\n', ' ').replace('\r', '').replace('.', '').replace(',', '').replace('\t', ' ').replace(' ', '')
158
+            return pytesseract.image_to_string(rgbImage, config="--psm 6 -c tessedit_char_whitelist=0123456789,").strip().replace('\n', ' ').replace('\r', '').replace('.', '').replace(',', '').replace('\t', ' ').replace(' ', '')
115 159
         else: 
116
-            return pytesseract.image_to_string(rgbimage, config="--psm 6").strip().replace('\n', ' ').replace('\r', '').replace('\t', ' ')
160
+            return pytesseract.image_to_string(rgbImage, config="--psm 6").strip().replace('\n', ' ').replace('\r', '').replace('\t', ' ')
161
+
162
+# Write to output file
163
+def write_file():
164
+    return
165
+
166
+# Initialize child processes (ignore SIGINT)
167
+def mpInitializer():
168
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
117 169
 
118 170
 # ----
119 171
 # Arguments
@@ -131,52 +183,58 @@ arguments = parser.parse_args()
131 183
 # Program
132 184
 # ----
133 185
 
186
+# TODO: remove globals
187
+debugFolder = "debug" + "/"
188
+
134 189
 if __name__ == '__main__':
135 190
     # Create project folder
136 191
     projectFolder = "output/" + arguments.project + "/"
137 192
     if not os.path.exists(projectFolder):
138 193
         os.makedirs(projectFolder)
139 194
 
140
-    debugFolder = "debug" + "/"
195
+    # Create debug folder
141 196
     if arguments.debug:
142 197
         if not os.path.exists(debugFolder):
143 198
             os.makedirs(debugFolder)
144 199
 
145
-    # Get files
200
+    # Get files to read
146 201
     screenshots_to_read = glob.glob(arguments.file, recursive=True)
147 202
     screenshot_count = len(screenshots_to_read)
148 203
     if screenshot_count < 1: sys.exit("No files found.")
149 204
 
150
-    # Scrape
151
-    if arguments.verbose: print("Scraping", screenshot_count, "files")
205
+    # Get all previously scraped data # TODO: limit to filenames only
206
+    alreadyScraped = ""
207
+    for outputPath in [OUTPUT_PATH_PROFILE, OUTPUT_PATH_MOREINFO, OUTPUT_PATH_KILLS]:
208
+        with open(projectFolder + outputPath, "r+", newline='', encoding='utf-8') as outputFile:
209
+            alreadyScraped = alreadyScraped + outputFile.read()
210
+
211
+    # Mark as duplicates
152 212
     for i, file in enumerate(screenshots_to_read):
153
-        filename = os.path.basename(file)
154
-
155
-        if arguments.verbose: print(i+1, "/", screenshot_count, ": ", filename, sep="")
156
-
157
-        if "profile" in filename:
158
-            targets = PROFILE_TARGETS
159
-            output = OUTPUT_PATH_PROFILE
160
-        elif "more" in filename:
161
-            targets = MOREINFO_TARGETS
162
-            output = OUTPUT_PATH_MOREINFO
163
-        elif "kills" in filename:
164
-            targets = KILLS_TARGETS
165
-            output = OUTPUT_PATH_KILLS
213
+        if os.path.basename(file) in alreadyScraped:
214
+            screenshots_to_read[i] = (i, file, True)
166 215
         else:
167
-            sys.exit("File name doesn't contain type")
168
-
169
-        # TODO: bad time complexity
170
-        exists = False
171
-        with open(projectFolder + output, "a+", newline='', encoding='utf-8') as output_file:
172
-            if filename not in output_file.read():
173
-                if not arguments.debug: output_file.write(filename + "\t")
174
-                for i, target in enumerate(targets):
175
-                    debugFile = os.path.splitext(filename)[0] + "_" + str(i) + ".png"
176
-                    string = read_string_from_image(file, Box(target[1][0], target[1][1], target[2][0], target[2][1]), target[3], target[4], target[5], debugFolder + debugFile)
177
-                    if arguments.verbose: print("  ", target[0], ": ", string, sep="")
178
-                    if i and not arguments.debug: output_file.write("\t")
179
-                    if not arguments.debug: output_file.write(string)
180
-                if not arguments.debug: output_file.write("\n")
181
-            else:
216
+            screenshots_to_read[i] = (i, file, False)
217
+
218
+    # Scrape
219
+    if arguments.verbose: print("Scraping", screenshot_count, "files")
220
+
221
+    cpuCount = multiprocessing.cpu_count()
222
+    mpPool = multiprocessing.Pool(cpuCount, initializer=mpInitializer)
223
+
224
+    try:
225
+        # Returns: (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
226
+        for result in mpPool.imap(read_file, screenshots_to_read):
227
+            if arguments.verbose: print(result[0]+1, "/", screenshot_count, ": ", result[1], sep="")
228
+            if result[5] and not arguments.debug:
182 229
                 if arguments.verbose: print("  ", "already scraped.")
230
+            else:
231
+                if arguments.verbose: print(result[2])
232
+                if not arguments.debug:
233
+                    with open(projectFolder + result[3], "a+", newline='', encoding='utf-8') as outputFile:
234
+                        outputFile.write(result[4] + "\n")
235
+    except KeyboardInterrupt:
236
+        print("Exiting...")
237
+        mpPool.terminate()
238
+    finally:
239
+        mpPool.terminate()
240
+        mpPool.join()    

Laden…
Annuleren
Opslaan