[ create a new paste ] login | about

Link: http://codepad.org/kgGrvwwP    [ raw code | fork ]

Python, pasted on Feb 24:
#! /usr/bin/env python
#
# Progenic CAPTCHA solver by fgnas
# Developed under Python 2.7.1
#
# Requires:
# PIL 1.1.7 - http://www.pythonware.com/products/pil/ (python-imaging)
# Tesseract OCR - http://code.google.com/p/tesseract-ocr/
#
# Included in this distribution:
# Python-tesseract - https://github.com/hoffstaetter/python-tesseract
#
# File: progenic_captcha.py

import os
import tesseract
import Image

# Main API management interface to solve a basic CAPTCHA
class captcha_manager:
    """Load a CAPTCHA image, clean it and hand it to Tesseract.

    Typical use: open() an image file, call read() to obtain the recognised
    text, then destroy() to drop the image references.
    """

    # Class-level defaults; open() stores per-instance values over them
    _captcha_data_handle = None
    _captcha_file_handle = None
    _captcha_temp_file = './__tmp_progenic_captcha__.png'

    # Brightness cut-offs handed to captcha_cleaner.clean(), expressed on the
    # summed R + G + B scale (three times the per-channel values 0x09 / 0x52)
    _text_threshold = 3 * 0x09
    _edge_threshold = 3 * 0x52

    def __init__(self, captcha_file = None):
        """Create the manager, opening captcha_file right away when given."""
        if captcha_file:
            self.open(captcha_file)

    def open(self, captcha_file):
        """Open an image file and prepare its pixel data for cleaning.

        Any temporary file left at _captcha_temp_file is removed first.
        The image is converted to RGB when it is in another mode.

        Raises:
            IOError: captcha_file does not exist, or the stale temporary
                file could not be removed.
        """
        if not os.path.exists(captcha_file):
            raise IOError('File not found')

        if os.path.exists(self._captcha_temp_file):
            try:
                os.remove(self._captcha_temp_file)
            except OSError:
                raise IOError('Temporary file could not be removed')

        image = Image.open(captcha_file)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        self._captcha_file_handle = image
        self._captcha_data_handle = image.load()

    def read(self, explicit_save = False):
        """Clean the opened image, crop it to the text and run OCR on it.

        Args:
            explicit_save: when true, the cropped image is also written to
                _captcha_temp_file as PNG.

        Returns:
            Whatever tesseract.image_to_string() returns for the cropped
            image.

        Raises:
            ValueError: no image has been opened yet.
        """
        if self._captcha_file_handle is None or self._captcha_data_handle is None:
            raise ValueError('No image opened')

        cleaner = captcha_cleaner()
        cleaner.set_data(self._captcha_data_handle, self._captcha_file_handle.size)
        cleaner.clean(self._text_threshold, self._edge_threshold)
        left, upper, right, lower = cleaner.get_crop_extents(5)
        cleaner.destroy()

        # The cleaner reports inclusive pixel coordinates, while the right and
        # lower edges of a PIL crop box are exclusive; widen by one so the
        # last column and row are kept.
        crop_box = (left, upper, right + 1, lower + 1)
        self._captcha_file_handle = self._captcha_file_handle.crop(crop_box)
        # Keep the pixel-access handle in step with the image it belongs to
        self._captcha_data_handle = self._captcha_file_handle.load()

        if explicit_save:
            self._captcha_file_handle.save(self._captcha_temp_file, 'PNG')
        return tesseract.image_to_string(self._captcha_file_handle)

    def destroy(self):
        """Release the image references; safe to call even if open() never ran."""
        # Assigning None instead of using del avoids an AttributeError when
        # the attributes only exist as class-level defaults.
        self._captcha_data_handle = None
        self._captcha_file_handle = None

# Image cleaner class, provides methods to make the CAPTCHA OCR friendly 
class captcha_cleaner:
    """Strip background noise from CAPTCHA pixel data and locate the text.

    The pixel data is any object indexable as data[x, y] that yields and
    accepts (R, G, B) tuples. clean() rewrites it in place and records the
    bounding box of the text pixels, which get_crop_extents() reports.
    """

    # Class-level defaults. They are never mutated; instances always store
    # their own values so state cannot leak from one cleaner to another.
    _captcha_x = None
    _captcha_y = None
    _captcha_data_handle = None
    _captcha_metadata = [None, None]

    def __init__(self):
        """Create an empty cleaner; call set_data() before clean()."""
        self._captcha_x = None
        self._captcha_y = None
        self._captcha_data_handle = None
        self._captcha_metadata = [None, None]

    def _add_metadata(self, x, y):
        """Grow the text bounding box so that it contains pixel (x, y).

        _captcha_metadata holds [[min_x, min_y], [max_x, max_y]]; either
        entry is None until the first text pixel is recorded.
        """
        low, high = self._captcha_metadata
        if low is None:
            low = [x, y]
        else:
            low = [min(low[0], x), min(low[1], y)]
        if high is None:
            high = [x, y]
        else:
            high = [max(high[0], x), max(high[1], y)]
        # Store a fresh list rather than mutating the old one, so a list
        # shared through the class attribute is never modified.
        self._captcha_metadata = [low, high]

    def set_data(self, captcha_data, dimensions):
        """Attach the pixel data to clean.

        Args:
            captcha_data: pixel-access object indexable as data[x, y].
            dimensions: (width, height) of the image.

        Raises:
            ValueError: dimensions does not have exactly two entries.
        """
        if len(dimensions) != 2:
            raise ValueError('Invalid image dimensions')
        self._captcha_data_handle = captcha_data
        self._captcha_x = dimensions[0]
        self._captcha_y = dimensions[1]
        # Extents recorded for a previous image do not apply to this one
        self._captcha_metadata = [None, None]

    def _touches_text(self, x, y, text_threshold):
        """Return True when any of the 8 neighbours of (x, y) is text-dark."""
        pixels = self._captcha_data_handle
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                if dx == 0 and dy == 0:
                    continue
                nx = x + dx
                ny = y + dy
                # Row 0 and column 0 are valid pixels; only skip positions
                # that fall outside the image.
                if nx < 0 or ny < 0 or nx >= self._captcha_x or ny >= self._captcha_y:
                    continue
                neighbour = pixels[nx, ny]
                # Same R + G + B scale and same cut-off rule as the pixel
                # classification in clean()
                if neighbour[0] + neighbour[1] + neighbour[2] <= text_threshold:
                    return True
        return False

    def clean(self, text_threshold, edge_threshold):
        """Rewrite the pixel data in place, leaving text on a white background.

        Brightness is the sum R + G + B of a pixel. Pixels are visited
        column by column:
          * brighter than edge_threshold: painted white;
          * brighter than text_threshold: kept only when next to a text
            pixel, otherwise painted white;
          * anything else: painted black and added to the text bounding box.

        Args:
            text_threshold: highest brightness still treated as text.
            edge_threshold: brightness above which a pixel is background.

        Raises:
            ValueError: set_data() has not supplied pixel data and non-zero
                dimensions.
        """
        pixels = self._captcha_data_handle
        width = self._captcha_x
        height = self._captcha_y
        if pixels is None or not width or not height:
            raise ValueError('No image data to clean')

        white = (255, 255, 255)
        black = (0, 0, 0)
        for x in range(width):
            for y in range(height):
                pixel = pixels[x, y]
                brightness = pixel[0] + pixel[1] + pixel[2]
                if brightness > edge_threshold:
                    pixels[x, y] = white
                elif brightness > text_threshold:
                    # Mid-tone pixel: keep it only as an edge of the text
                    if not self._touches_text(x, y, text_threshold):
                        pixels[x, y] = white
                else:
                    pixels[x, y] = black
                    self._add_metadata(x, y)

    def get_crop_extents(self, padding = 5):
        """Return the padded text bounding box as (x1, y1, x2, y2).

        The coordinates are inclusive pixel positions clamped to the image,
        i.e. x2 is at most width - 1 and y2 at most height - 1. When no text
        pixel has been recorded the extents of the whole image are returned.
        """
        last_x = self._captcha_x - 1
        last_y = self._captcha_y - 1
        low, high = self._captcha_metadata
        if low is None or high is None:
            return (0, 0, last_x, last_y)
        x1 = max(low[0] - padding, 0)
        y1 = max(low[1] - padding, 0)
        x2 = min(high[0] + padding, last_x)
        y2 = min(high[1] + padding, last_y)
        return (x1, y1, x2, y2)

    def destroy(self):
        """Release the pixel data and reset the cleaner to its empty state."""
        # Assigning defaults instead of using del keeps this safe to call
        # when set_data() never ran or no text pixel was ever recorded.
        self._captcha_data_handle = None
        self._captcha_metadata = [None, None]
        self._captcha_x = None
        self._captcha_y = None

if __name__ == '__main__':
    # Script entry point: solve one CAPTCHA image and print the recognised text.
    import sys

    # An optional first command-line argument names the image to solve;
    # without it the previously hard-coded file name is used.
    captcha_path = sys.argv[1] if len(sys.argv) > 1 else 'progenic_captcha7.jpg'

    captcha = captcha_manager()
    captcha.open(captcha_path)
    try:
        print(captcha.read(True))
    finally:
        # Release the image handles even when reading fails
        captcha.destroy()
    


Create a new paste based on this one


Comments: