import argparse
import glob
import os
import multiprocessing
from PIL import Image, ImageFilter
from PIL.ImageOps import autocontrast, invert, grayscale, contain
import pytesseract
import signal
import sys

# ----
# Program
# ----

PROGRAM_NAME = "RoK Reader"
PROGRAM_VERSION = "0.3"
PROGRAM_DESCRIPTION = "This program reads data from Rise of Kingdoms screenshots. It currently supports three user profile screenshots."

# ----
# Classes
# ----


class Box:
    """Rectangle given by its top-left (x, y) and bottom-right (x2, y2) corners."""

    def __init__(self, x, y, x2, y2):
        self.x = x
        self.y = y
        self.x2 = x2
        self.y2 = y2


class RelativeBox:
    """Rectangle given by an offset (x_distance, y_distance) and a size.

    NOTE(review): nothing in this file references this class yet.
    """

    def __init__(self, x_distance, y_distance, width, height):
        self.x_distance = x_distance
        self.y_distance = y_distance
        self.width = width
        self.height = height


# ----
# Files
# ----

OUTPUT_PATH_PROFILE = "output-profile.csv"
OUTPUT_PATH_MOREINFO = "output-more.csv"
OUTPUT_PATH_KILLS = "output-kills.csv"

# ----
# Coordinates
# ----

# Each target is a tuple of:
#   Name, Top Left, Bottom Right, Number, Invert, BonusRightTrim
# - Name:           label used in the verbose/debug output
# - Top Left /
#   Bottom Right:   pixel corners of the region cropped out of the screenshot
# - Number:         True restricts OCR to digits and commas
# - Invert:         True inverts the crop's colours before filtering
# - BonusRightTrim: added to the right edge of the detected content box
#                   (a negative value trims pixels off the right)
PROFILE_TARGETS = [
    ("ID", (1246, 375), (1445, 430), True, True, -10),
    ("Power", (1435, 585), (1733, 634), True, True, 0),
    ("Kill Points", (1806, 585), (2112, 633), True, True, 0),
    ("Alliance", (1025, 584), (1427, 637), False, True, 0),
    ("Civilization", (1884, 420), (2132, 486), False, True, 0)
]

MOREINFO_TARGETS = [
    ("Power", (1305, 223), (1540, 274), True, True, 0),
    ("Kill Points", (1931, 222), (2188, 276), True, True, 0),
    ("Highest Power", (1815, 416), (2105, 483), True, True, 0),
    ("Victories", (1815, 515), (2105, 580), True, True, 0),
    ("Defeats", (1815, 613), (2105, 675), True, True, 0),
    ("Dead", (1815, 710), (2105, 771), True, True, 0),
    ("Scout Times", (1815, 806), (2105, 871), True, True, 0),
    ("Resources Gathered", (1815, 980), (2105, 1047), True, True, 0),
    ("Resource Assistance", (1815, 1077), (2105, 1144), True, True, 0),
    ("Alliance Help Times", (1815, 1174), (2105, 1238), True, True, 0)
]

KILLS_TARGETS = [
    ("Kill Points", (1418, 312), (1694, 352), True, False, 0),
    ("T1 Kills", (1321, 637), (1538, 684), True, False, 0),
    ("T1 Kill Points", (1986, 637), (2212, 684), True, False, 0),
    ("T2 Kills", (1321, 702), (1538, 755), True, False, 0),
    ("T2 Kill Points", (1986, 702), (2212, 755), True, False, 0),
    ("T3 Kills", (1321, 770), (1538, 824), True, False, 0),
    ("T3 Kill Points", (1986, 770), (2212, 824), True, False, 0),
    ("T4 Kills", (1321, 847), (1538, 897), True, False, 0),
    ("T4 Kill Points", (1986, 847), (2212, 897), True, False, 0),
    ("T5 Kills", (1321, 918), (1538, 968), True, False, 0),
    ("T5 Kill Points", (1986, 918), (2212, 968), True, False, 0),
    ("Previous Kills", (1626, 985), (2228, 1039), False, False, 0)
]

# Characters deleted from numeric OCR results: line breaks, separators and
# any whitespace between digit groups.
_NUMBER_NOISE = str.maketrans("", "", "\n\r., \t")

# ----
# Functions
# ----


# Pick the target table and output file for a screenshot
def _targets_for(filename):
    """Return (targets, outputPath) for *filename*, or None if its type is unknown.

    The type is recognised by substring, checked in this order:
    "profile", "more", "kills".
    """
    if "profile" in filename:
        return PROFILE_TARGETS, OUTPUT_PATH_PROFILE
    if "more" in filename:
        return MOREINFO_TARGETS, OUTPUT_PATH_MOREINFO
    if "kills" in filename:
        return KILLS_TARGETS, OUTPUT_PATH_KILLS
    return None


# Read an image file
def read_file(fileTuple):
    """OCR every target region of one screenshot.

    fileTuple is (fileNumber, file, isDuplicate) as built by the main program.

    Returns (fileNumber, filename, debugOutput, outputPath, outputLine,
    isDuplicate). For a duplicate (when not in debug mode) the image is not
    opened and debugOutput / outputLine are empty strings.

    Raises ValueError when the file name matches no known screenshot type.
    This runs inside a pool worker, so it raises instead of calling
    sys.exit(): an exception is reported back to the parent, whereas exiting
    the worker would leave the parent waiting for a result that never comes.
    """
    fileNumber, file, isDuplicate = fileTuple[0], fileTuple[1], fileTuple[2]
    filename = os.path.basename(file)

    match = _targets_for(filename)
    if match is None:
        raise ValueError("File name doesn't contain type")
    targets, outputPath = match

    if isDuplicate and not arguments.debug:
        return (fileNumber, filename, "", outputPath, "", isDuplicate)

    # Open image and flatten it onto a white RGB background. Converting to
    # RGBA first guarantees an alpha band exists, so screenshots saved
    # without transparency are handled too.
    with Image.open(file) as image:
        rgba = image.convert("RGBA")
    rgbImage = Image.new("RGB", rgba.size, (255, 255, 255))
    rgbImage.paste(rgba, mask=rgba.split()[3])

    # Get data
    debugPrefix = debugFolder + os.path.splitext(filename)[0] + "_"
    strings = []
    debugLines = []
    for i, target in enumerate(targets):
        name, topLeft, bottomRight, isNumber, inv, bonusRightTrim = target
        string = read_string_from_image(
            rgbImage,
            Box(topLeft[0], topLeft[1], bottomRight[0], bottomRight[1]),
            isNumber,
            inv,
            bonusRightTrim,
            debugPrefix + str(i) + ".png",
        )
        strings.append(string)
        debugLines.append(" " + name + ": " + string + "\n")

    # One tab-separated row: file name first, then one column per target
    outputLine = filename + "\t" + "\t".join(strings)
    debugOutput = "".join(debugLines)
    return (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)


# Read text from a section of an image using Tesseract
def read_string_from_image(rgbImage, box, is_number, inv, bonusRightTrim, debugFilePath):
    """Crop *box* out of *rgbImage*, clean the crop up and OCR it.

    rgbImage:       source image; not modified (each step returns a new image)
    box:            region to read (uses .x, .y, .x2, .y2)
    is_number:      restrict OCR to digits/commas and strip every separator
                    and space from the result
    inv:            invert colours before filtering
    bonusRightTrim: added to the right edge of the detected content box
    debugFilePath:  where the pre-processed crop is saved in debug mode

    Returns the recognised text as a single-line string.
    """
    # Crop to correct dimensions
    rgbImage = rgbImage.crop((box.x, box.y, box.x2, box.y2))

    # Invert if flagged
    if inv:
        rgbImage = invert(rgbImage)

    # Apply filters
    rgbImage = grayscale(rgbImage)
    rgbImage = autocontrast(rgbImage, cutoff=(0, 75))

    # Crop to content
    bbox = autocontrast(invert(rgbImage), cutoff=(0, 90)).getbbox()
    if bbox:
        rgbImage = rgbImage.crop((bbox[0], bbox[1], bbox[2] + bonusRightTrim, bbox[3]))

    # Resize and sharpen
    # NOTE(review): method=1 should be Pillow's LANCZOS resampling filter - confirm
    rgbImage = contain(rgbImage, (800, 800), method=1)
    rgbImage = rgbImage.filter(ImageFilter.EDGE_ENHANCE_MORE)
    rgbImage = rgbImage.filter(ImageFilter.SHARPEN)

    if arguments.debug:
        rgbImage.save(debugFilePath)

    if is_number:
        text = pytesseract.image_to_string(rgbImage, config="--psm 6 -c tessedit_char_whitelist=0123456789,")
        return text.strip().translate(_NUMBER_NOISE)

    text = pytesseract.image_to_string(rgbImage, config="--psm 6")
    return text.strip().replace('\n', ' ').replace('\r', '').replace('\t', ' ')


# Write to output file
def write_file():
    """Placeholder; currently does nothing."""
    return


# Initialize child processes (ignore SIGINT)
def mpInitializer():
    """Make a pool worker ignore SIGINT so Ctrl+C is handled by the parent only."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# Collect the file names that already have a row in the output files
def _load_scraped_filenames(projectFolder):
    """Return the set of file names found in the first column of the output CSVs.

    Output files that do not exist yet (e.g. on the first run of a project)
    are skipped.
    """
    scraped = set()
    for outputPath in (OUTPUT_PATH_PROFILE, OUTPUT_PATH_MOREINFO, OUTPUT_PATH_KILLS):
        try:
            with open(projectFolder + outputPath, "r", encoding='utf-8') as outputFile:
                for line in outputFile:
                    name = line.rstrip("\n").split("\t", 1)[0]
                    if name:
                        scraped.add(name)
        except FileNotFoundError:
            continue
    return scraped


# Scrape all requested screenshots
def main():
    """Scrape the screenshots matched by --file into the project's CSV files."""
    # Create project folder
    projectFolder = "output/" + arguments.project + "/"
    os.makedirs(projectFolder, exist_ok=True)

    # Create debug folder
    if arguments.debug:
        os.makedirs(debugFolder, exist_ok=True)

    # Get files to read
    files = glob.glob(arguments.file, recursive=True)
    screenshot_count = len(files)
    if screenshot_count < 1:
        sys.exit("No files found.")

    # Reject unknown screenshot types before any worker starts
    for file in files:
        if _targets_for(os.path.basename(file)) is None:
            sys.exit("File name doesn't contain type")

    # Mark as duplicates (exact match on the file-name column)
    alreadyScraped = _load_scraped_filenames(projectFolder)
    screenshots_to_read = [
        (i, file, os.path.basename(file) in alreadyScraped)
        for i, file in enumerate(files)
    ]

    # Scrape
    if arguments.verbose:
        print("Scraping", screenshot_count, "files")

    cpuCount = multiprocessing.cpu_count()
    mpPool = multiprocessing.Pool(cpuCount, initializer=mpInitializer)
    try:
        for result in mpPool.imap(read_file, screenshots_to_read):
            fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate = result
            if arguments.verbose:
                print(fileNumber + 1, "/", screenshot_count, ": ", filename, sep="")

            if isDuplicate and not arguments.debug:
                if arguments.verbose:
                    print(" ", "already scraped.")
                continue

            if arguments.verbose:
                print(debugOutput)
            # Debug runs only produce debug images; they never touch the CSVs
            if not arguments.debug:
                with open(projectFolder + outputPath, "a+", newline='', encoding='utf-8') as outputFile:
                    outputFile.write(outputLine + "\n")
    except KeyboardInterrupt:
        print("Exiting...")
    finally:
        mpPool.terminate()
        mpPool.join()


# ----
# Arguments
# ----

# Parsed at module level on purpose: read_file() and read_string_from_image()
# read `arguments` as a global inside the pool workers, so it has to exist
# whenever this module is loaded, not only when it runs as a script.
parser = argparse.ArgumentParser(description = PROGRAM_DESCRIPTION)
parser.add_argument("-p", "--project", help = "project name", required = True)
parser.add_argument("-f", "--file", help = "file name (globs accepted)", required = True)
parser.add_argument("-o", "--output", help = "output file")
parser.add_argument("-v", "--verbose", help = "be verbose", default = False, action = "store_true")
parser.add_argument("--debug", help = "save debug images", default = False, action = "store_true")
arguments = parser.parse_args()

# ----
# Program
# ----

# TODO: remove globals
debugFolder = "debug" + "/"

if __name__ == '__main__':
    main()