import argparse
import glob
import multiprocessing
import os
import signal
import sys

import cv2
import numpy
from PIL import Image, ImageFilter
from PIL.ImageOps import autocontrast, invert, grayscale, contain, expand
import pytesseract

# ----
# Program
# ----

PROGRAM_NAME = "RoK Reader"
PROGRAM_VERSION = "0.3"
PROGRAM_DESCRIPTION = "This program reads data from Rise of Kingdoms screenshots. It currently supports three user profile screenshots."

# ----
# Classes
# ----


class Box:
    """Rectangle given by its top-left (x, y) and bottom-right (x2, y2) corners."""

    def __init__(self, x, y, x2, y2):
        self.x = x
        self.y = y
        self.x2 = x2
        self.y2 = y2


class RelativeBox:
    """Rectangle given by an x/y distance plus a width and a height."""

    def __init__(self, x_distance, y_distance, width, height):
        self.x_distance = x_distance
        self.y_distance = y_distance
        self.width = width
        self.height = height


# ----
# Files
# ----

OUTPUT_PATH_PROFILE = "output-profile.csv"
OUTPUT_PATH_MOREINFO = "output-more.csv"
OUTPUT_PATH_KILLS = "output-kills.csv"

# ----
# Coordinates
# ----

# Name, Top Left, Bottom Right, Number, Invert, BonusRightTrim
PROFILE_TARGETS = [
    ("ID", (1246, 375), (1445, 430), True, True, -10),
    ("Power", (1435, 585), (1733, 634), True, True, 0),
    ("Kill Points", (1806, 585), (2112, 633), True, True, 0),
    ("Alliance", (1025, 584), (1427, 637), False, True, 0),
    ("Civilization", (1884, 420), (2132, 486), False, True, 0)
]

MOREINFO_TARGETS = [
    ("Power", (1305, 223), (1540, 274), True, True, 0),
    ("Kill Points", (1931, 222), (2188, 276), True, True, 0),
    ("Highest Power", (1815, 416), (2105, 483), True, True, 0),
    ("Victories", (1815, 515), (2105, 580), True, True, 0),
    ("Defeats", (1815, 613), (2105, 675), True, True, 0),
    ("Dead", (1815, 710), (2105, 771), True, True, 0),
    ("Scout Times", (1815, 806), (2105, 871), True, True, 0),
    ("Resources Gathered", (1815, 980), (2105, 1047), True, True, 0),
    ("Resource Assistance", (1815, 1077), (2105, 1144), True, True, 0),
    ("Alliance Help Times", (1815, 1174), (2105, 1238), True, True, 0)
]

KILLS_TARGETS = [
    ("Kill Points", (1418, 312), (1694, 352), True, False, 0),
    ("T1 Kills", (1325, 637), (1538, 684), True, False, 0),
    ("T1 Kill Points", (1986, 637), (2212, 684), True, False, 0),
    ("T2 Kills", (1325, 702), (1538, 755), True, False, 0),
    ("T2 Kill Points", (1986, 702), (2212, 755), True, False, 0),
    ("T3 Kills", (1325, 770), (1538, 824), True, False, 0),
    ("T3 Kill Points", (1986, 770), (2212, 824), True, False, 0),
    ("T4 Kills", (1325, 847), (1538, 897), True, False, 0),
    ("T4 Kill Points", (1986, 847), (2212, 897), True, False, 0),
    ("T5 Kills", (1325, 918), (1538, 968), True, False, 0),
    ("T5 Kill Points", (1986, 918), (2212, 968), True, False, 0),
    ("Previous Kills", (1626, 985), (2228, 1039), False, False, -385)
]

# ----
# Functions
# ----


# Read an image file
def read_file(fileTuple):
    """Read every target field from one screenshot.

    Args:
        fileTuple: ``(fileNumber, file, isDuplicate)`` where ``file`` is the
            path of the screenshot.

    Returns:
        ``(fileNumber, filename, debugOutput, outputPath, outputLine,
        isDuplicate)``. ``debugOutput`` and ``outputLine`` are empty strings
        when the file is a duplicate and debug mode is off, because the image
        is not read in that case.

    Raises:
        ValueError: the file name contains none of "profile", "more" or
            "kills", so the screenshot type cannot be determined.
    """
    fileNumber, file, isDuplicate = fileTuple
    filename = os.path.basename(file)

    # The screenshot type is taken from the file name.
    if "profile" in filename:
        targets = PROFILE_TARGETS
        outputPath = OUTPUT_PATH_PROFILE
    elif "more" in filename:
        targets = MOREINFO_TARGETS
        outputPath = OUTPUT_PATH_MOREINFO
    elif "kills" in filename:
        targets = KILLS_TARGETS
        outputPath = OUTPUT_PATH_KILLS
    else:
        # Raise instead of sys.exit(): this function runs inside pool
        # workers, where SystemExit kills the worker without reporting a
        # result and leaves the parent waiting forever.
        raise ValueError("File name doesn't contain type: " + filename)

    # Duplicates are only re-read in debug mode.
    if isDuplicate and not arguments.debug:
        return (fileNumber, filename, "", outputPath, "", isDuplicate)

    # Open the image and flatten it onto a white RGB background. Converting
    # to RGBA first guarantees an alpha band exists to use as the paste mask,
    # even when the screenshot was saved without transparency.
    with Image.open(file) as image:
        rgbaImage = image.convert("RGBA")
    rgbImage = Image.new("RGB", rgbaImage.size, (255, 255, 255))
    rgbImage.paste(rgbaImage, mask=rgbaImage.split()[3])

    # Get data
    debugStem = os.path.splitext(filename)[0]
    values = []
    debugLines = []
    for i, target in enumerate(targets):
        name, topLeft, bottomRight, isNumber, inv, bonusRightTrim = target
        debugFile = debugStem + "_" + str(i) + ".png"
        string = read_string_from_image(
            rgbImage,
            Box(topLeft[0], topLeft[1], bottomRight[0], bottomRight[1]),
            isNumber,
            inv,
            bonusRightTrim,
            debugFolder + debugFile,
        )
        values.append(string)
        debugLines.append(" " + name + ": " + string + "\n")

    # One tab-separated row: file name first, then each value in target order.
    outputLine = filename + "\t" + "\t".join(values)
    debugOutput = "".join(debugLines)
    return (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)


# Read text from a section of an image using Tesseract
# Characters removed from numeric OCR results: line breaks, separators and blanks.
_NUMBER_CLEANUP = str.maketrans("", "", "\n\r., \t")
# Whitespace normalisation for free-text OCR results.
_TEXT_CLEANUP = str.maketrans({"\n": " ", "\r": None, "\t": " "})


def read_string_from_image(rgbImage, box, is_number, inv, bonusRightTrim, debugFilePath):
    """OCR the text inside one rectangular region of an image.

    Args:
        rgbImage: Pillow image to read from.
        box: object with ``x``, ``y``, ``x2`` and ``y2`` attributes giving the
            region to crop.
        is_number: when true, Tesseract is restricted to digits and commas
            and every separator and blank is removed from the result.
        inv: when true, the cropped region is colour-inverted before
            processing.
        bonusRightTrim: value added to the right edge of the detected content
            box; negative values cut pixels off the right side.
        debugFilePath: where the preprocessed region is saved when the
            ``--debug`` flag is set.

    Returns:
        The recognised text as a single line, or ``""`` when trimming leaves
        nothing to read.
    """
    # Crop to correct dimensions
    region = rgbImage.crop((box.x, box.y, box.x2, box.y2))

    # Invert if flagged
    if inv:
        region = invert(region)

    # Apply Pillow filters to cut off artifacts
    region = autocontrast(region, cutoff=(0, 50))

    # Convert to OpenCV and set colors to grayscale. Arrays built from a
    # Pillow image are in RGB channel order, so the RGB conversion code is
    # the one that applies the luminance weights to the right channels.
    npImage = numpy.array(region)
    npImage = cv2.cvtColor(npImage, cv2.COLOR_RGB2GRAY)

    # Apply OpenCV filters
    npImage = cv2.medianBlur(npImage, 3)
    _, npImage = cv2.threshold(npImage, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Convert to Pillow
    region = Image.fromarray(npImage)

    # Crop to content. getbbox() reports non-zero pixels, so it is run on the
    # inverted image to find the dark pixels of the thresholded region.
    bbox = invert(region).getbbox()
    if bbox:
        left, top, right, bottom = bbox
        right = right + bonusRightTrim
        if right <= left:
            # The trim removed all detected content; nothing is left to read.
            return ""
        region = region.crop((left, top, right, bottom))
        # Trimming can expose new empty margins, so tighten the crop again.
        tightBox = invert(region).getbbox()
        if tightBox:
            region = region.crop(tightBox)

    # Add a white border around the content before OCR.
    region = expand(region, border=10, fill=255)

    if arguments.debug:
        region.save(debugFilePath)

    if is_number:
        text = pytesseract.image_to_string(region, config="--psm 6 -c tessedit_char_whitelist=0123456789,")
        return text.strip().translate(_NUMBER_CLEANUP)

    text = pytesseract.image_to_string(region, config="--psm 6")
    return text.strip().translate(_TEXT_CLEANUP)


# Write to output file
def write_file():
    """Placeholder; currently does nothing and returns None."""
    return


# Initialize child processes (ignore SIGINT)
def mpInitializer():
    """Make the calling process ignore SIGINT."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# ----
# Arguments
# ----

parser = argparse.ArgumentParser(description = PROGRAM_DESCRIPTION)
parser.add_argument("-p", "--project", help = "project name", required = True)
parser.add_argument("-f", "--file", help = "file name (globs accepted)", required = True)
parser.add_argument("-o", "--output", help = "output file")
parser.add_argument("-v", "--verbose", help = "be verbose", default = False, action = "store_true")
parser.add_argument("--debug", help = "save debug images", default = False, action = "store_true")
arguments = parser.parse_args()

# ----
# Program
# ----

# TODO: remove globals
debugFolder = "debug" + "/"

if __name__ == '__main__':
    # Create project folder
    projectFolder = "output/" + arguments.project + "/"
    os.makedirs(projectFolder, exist_ok=True)

    # Create debug folder
    if arguments.debug:
        os.makedirs(debugFolder, exist_ok=True)

    # Get files to read
    screenshots_to_read = glob.glob(arguments.file, recursive=True)
    screenshot_count = len(screenshots_to_read)
    if screenshot_count < 1:
        sys.exit("No files found.")

    # Collect the file names that were already scraped. Each output row
    # starts with the screenshot file name followed by a tab, so the first
    # field of every line is the name.
    # "a+" creates a missing file without truncating an existing one ("w+"
    # would erase earlier results before they could be read); seek(0) moves
    # the read position back to the start.
    alreadyScraped = set()
    for outputPath in [OUTPUT_PATH_PROFILE, OUTPUT_PATH_MOREINFO, OUTPUT_PATH_KILLS]:
        with open(projectFolder + outputPath, "a+", newline='', encoding='utf-8') as outputFile:
            outputFile.seek(0)
            for line in outputFile:
                scrapedName = line.split("\t", 1)[0].rstrip("\r\n")
                if scrapedName:
                    alreadyScraped.add(scrapedName)

    # Mark as duplicates: (fileNumber, file, isDuplicate)
    screenshots_to_read = [
        (i, file, os.path.basename(file) in alreadyScraped)
        for i, file in enumerate(screenshots_to_read)
    ]

    # Scrape
    if arguments.verbose:
        print("Scraping", screenshot_count, "files")

    cpuCount = multiprocessing.cpu_count()
    mpPool = multiprocessing.Pool(cpuCount, initializer=mpInitializer)
    try:
        # Returns: (fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate)
        for result in mpPool.imap(read_file, screenshots_to_read):
            fileNumber, filename, debugOutput, outputPath, outputLine, isDuplicate = result

            if arguments.verbose:
                print(fileNumber + 1, "/", screenshot_count, ": ", filename, sep="")

            if isDuplicate and not arguments.debug:
                if arguments.verbose:
                    print(" ", "already scraped.")
                continue

            if arguments.verbose:
                print(debugOutput)

            # Debug runs only produce debug images; results are not recorded.
            if not arguments.debug:
                with open(projectFolder + outputPath, "a+", newline='', encoding='utf-8') as outputFile:
                    outputFile.write(outputLine + "\n")
    except KeyboardInterrupt:
        print("Exiting...")
    finally:
        # Always stop the workers, whether the run finished, failed or was
        # interrupted.
        mpPool.terminate()
        mpPool.join()