pytesseract识别验证码

验证码破解是做爬虫时经常要面对的问题。对于一般的字符串或算式验证码，可以先对网页截图，再从截图中截取验证码图片，最后用OCR识别出验证码字符串。

网页截图可以使用selenium的save_screenshot()方法；
用PIL图像处理库来截取验证码图片；
用Tesseract来做OCR识别。

要安装的python第三方库有：selenium、pillow（PIL）、pytesseract

要安装的软件有：selenium用的webdriver、pytesseract用的Tesseract

1. 网页截图并截取验证码

import selenium

from PIL import Image

...

# Placeholder paths: where the full-page screenshot and the cropped captcha
# image are written.
fileScreenshot = "截屏文件保存路径"
fileCaptcha = "验证码图片保存路径"

# Capture the whole page; the captcha is cut out of this screenshot below.
self.logger.debug("屏幕截图: %s" % fileScreenshot)
self.webdriver.save_screenshot(fileScreenshot)

# Look up the captcha <img> element by its id and derive the crop box from
# the position and size the element reports.
# NOTE(review): assumes screenshot pixels map 1:1 to these page coordinates
# -- confirm on scaled / HiDPI displays.
imgCaptcha = self.webdriver.find_element_by_id("验证码img标签的id")
loc, size = imgCaptcha.location, imgCaptcha.size
left, top = int(loc['x']), int(loc['y'])
right, bottom = int(loc['x'] + size['width']), int(loc['y'] + size['height'])

# Crop the box out of the screenshot and save it as the captcha image.
self.logger.debug("截取验证码: %s" % fileCaptcha)
image = Image.open(fileScreenshot).crop((left, top, right, bottom))
image.save(fileCaptcha)

import selenium
from PIL import Image

...

# Placeholder paths: where the full-page screenshot and the cropped captcha
# image are written.
fileScreenshot = "截屏文件保存路径"
fileCaptcha = "验证码图片保存路径"

# Capture the whole page; the captcha is cut out of this screenshot below.
self.logger.debug("屏幕截图: %s" % fileScreenshot)
self.webdriver.save_screenshot(fileScreenshot)

# Look up the captcha <img> element by its id and derive the crop box from
# the position and size the element reports.
# NOTE(review): assumes screenshot pixels map 1:1 to these page coordinates
# -- confirm on scaled / HiDPI displays.
imgCaptcha = self.webdriver.find_element_by_id("验证码img标签的id")
loc = imgCaptcha.location
size = imgCaptcha.size
left = int(loc['x'])
top = int(loc['y'])
right = int(loc['x'] + size['width'])
bottom = int(loc['y'] + size['height'])

# Crop the box out of the screenshot and save it as the captcha image.
self.logger.debug("截取验证码: %s" % fileCaptcha)
image = Image.open(fileScreenshot)
image = image.crop((left, top, right, bottom))
image.save(fileCaptcha)

import selenium
from PIL import Image

...

# Placeholder paths: where the full-page screenshot and the cropped captcha
# image are written.
fileScreenshot = "截屏文件保存路径"
fileCaptcha = "验证码图片保存路径"

# Capture the whole page; the captcha is cut out of this screenshot below.
self.logger.debug("屏幕截图: %s" % fileScreenshot)
self.webdriver.save_screenshot(fileScreenshot)

# Look up the captcha <img> element by its id and derive the crop box from
# the position and size the element reports.
# NOTE(review): assumes screenshot pixels map 1:1 to these page coordinates
# -- confirm on scaled / HiDPI displays.
imgCaptcha = self.webdriver.find_element_by_id("验证码img标签的id")
loc, size = imgCaptcha.location, imgCaptcha.size
left, top = int(loc['x']), int(loc['y'])
right, bottom = int(loc['x'] + size['width']), int(loc['y'] + size['height'])

# Crop the box out of the screenshot and save it as the captcha image.
self.logger.debug("截取验证码: %s" % fileCaptcha)
image = Image.open(fileScreenshot).crop((left, top, right, bottom))
image.save(fileCaptcha)

2. 对验证码图片进行OCR识别（包括预处理）

# -*- coding: utf-8 -*-

import logging

import logging.config

import pytesseract

from PIL import Image

class DeCaptcha(object):
    """Recognise simple text captchas by running Tesseract OCR on a PIL image."""

    def __init__(self):
        """Create the instance and attach a logger named after the class."""
        super(DeCaptcha, self).__init__()

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug("Init DeCaptcha instance")

    def crack(self, imgCaptcha, config="--psm 7"):
        """Binarize the captcha image and run OCR on the result.

        Args:
            imgCaptcha: PIL Image instance holding the captcha.
            config: Extra command-line options passed to Tesseract through
                pytesseract. The default selects page segmentation mode 7
                (treat the image as a single text line). Tesseract 4 and
                later only accept the double-dash spelling; pass
                ``config="-psm 7"`` for old 3.x builds that only know the
                single-dash form.

        Returns:
            The string recognised by pytesseract.
        """
        # OCR runs on the binarized image rather than the raw one: the
        # author observed noticeably worse results without this step.
        code = pytesseract.image_to_string(self.binarize(imgCaptcha), config=config)
        self.logger.debug("OCR识别结果: %s", code)
        return code

    def binarize(self, imgCaptcha, threshold=130):
        """Convert an image to black and white.

        Args:
            imgCaptcha: PIL Image instance.
            threshold: Grey level cut-off; levels below it map to 0, levels
                at or above it map to 1.

        Returns:
            The result of applying the lookup table with output mode '1'
            to the greyscale version of the image.
        """
        # Greyscale first so every pixel is a single 0-255 level.
        image = imgCaptcha.convert('L')

        # One lookup entry per possible grey level.
        table = [0 if level < threshold else 1 for level in range(256)]
        return image.point(table, '1')

def main():
    """Configure logging, load the captcha image and run OCR on it.

    The recognised text is not returned or printed here; DeCaptcha.crack
    logs it at debug level.
    """
    logging.config.fileConfig("config/logging.config")
    captchaFile = "验证码图片路径"  # placeholder: path of the captcha image

    # Open the image in a with-block so the underlying file is closed once
    # OCR has finished.
    with Image.open(captchaFile) as image:
        deCaptcha = DeCaptcha()
        deCaptcha.crack(image)


if __name__ == "__main__":
    main()

# -*- coding: utf-8 -*-

import logging
import logging.config
import pytesseract
from PIL import Image


class DeCaptcha(object):
    """Recognise simple text captchas by running Tesseract OCR on a PIL image."""

    def __init__(self):
        """Create the instance and attach a logger named after the class."""
        super(DeCaptcha, self).__init__()

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug("Init DeCaptcha instance")

    def crack(self, imgCaptcha, config="--psm 7"):
        """Binarize the captcha image and run OCR on the result.

        Args:
            imgCaptcha: PIL Image instance holding the captcha.
            config: Extra command-line options passed to Tesseract through
                pytesseract. The default selects page segmentation mode 7
                (treat the image as a single text line). Tesseract 4 and
                later only accept the double-dash spelling; pass
                ``config="-psm 7"`` for old 3.x builds that only know the
                single-dash form.

        Returns:
            The string recognised by pytesseract.
        """
        # OCR runs on the binarized image rather than the raw one: the
        # author observed noticeably worse results without this step.
        code = pytesseract.image_to_string(self.binarize(imgCaptcha), config=config)
        self.logger.debug("OCR识别结果: %s", code)
        return code

    def binarize(self, imgCaptcha, threshold=130):
        """Convert an image to black and white.

        Args:
            imgCaptcha: PIL Image instance.
            threshold: Grey level cut-off; levels below it map to 0, levels
                at or above it map to 1.

        Returns:
            The result of applying the lookup table with output mode '1'
            to the greyscale version of the image.
        """
        # Greyscale first so every pixel is a single 0-255 level.
        image = imgCaptcha.convert('L')

        # One lookup entry per possible grey level.
        table = [0 if level < threshold else 1 for level in range(256)]
        return image.point(table, '1')


def main():
    """Configure logging, load the captcha image and run OCR on it.

    The recognised text is not returned or printed here; DeCaptcha.crack
    logs it at debug level.
    """
    logging.config.fileConfig("config/logging.config")
    captchaFile = "验证码图片路径"  # placeholder: path of the captcha image

    # Open the image in a with-block so the underlying file is closed once
    # OCR has finished.
    with Image.open(captchaFile) as image:
        deCaptcha = DeCaptcha()
        deCaptcha.crack(image)


if __name__ == "__main__":
    main()

# -*- coding: utf-8 -*-

import logging
import logging.config
import pytesseract
from PIL import Image


class DeCaptcha(object):
    """Recognise simple text captchas by running Tesseract OCR on a PIL image."""

    def __init__(self):
        """Create the instance and attach a logger named after the class."""
        super(DeCaptcha, self).__init__()

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug("Init DeCaptcha instance")

    def crack(self, imgCaptcha, config="--psm 7"):
        """Binarize the captcha image and run OCR on the result.

        Args:
            imgCaptcha: PIL Image instance holding the captcha.
            config: Extra command-line options passed to Tesseract through
                pytesseract. The default selects page segmentation mode 7
                (treat the image as a single text line). Tesseract 4 and
                later only accept the double-dash spelling; pass
                ``config="-psm 7"`` for old 3.x builds that only know the
                single-dash form.

        Returns:
            The string recognised by pytesseract.
        """
        # OCR runs on the binarized image rather than the raw one: the
        # author observed noticeably worse results without this step.
        code = pytesseract.image_to_string(self.binarize(imgCaptcha), config=config)
        self.logger.debug("OCR识别结果: %s", code)
        return code

    def binarize(self, imgCaptcha, threshold=130):
        """Convert an image to black and white.

        Args:
            imgCaptcha: PIL Image instance.
            threshold: Grey level cut-off; levels below it map to 0, levels
                at or above it map to 1.

        Returns:
            The result of applying the lookup table with output mode '1'
            to the greyscale version of the image.
        """
        # Greyscale first so every pixel is a single 0-255 level.
        image = imgCaptcha.convert('L')

        # One lookup entry per possible grey level.
        table = [0 if level < threshold else 1 for level in range(256)]
        return image.point(table, '1')


def main():
    """Configure logging, load the captcha image and run OCR on it.

    The recognised text is not returned or printed here; DeCaptcha.crack
    logs it at debug level.
    """
    logging.config.fileConfig("config/logging.config")
    captchaFile = "验证码图片路径"  # placeholder: path of the captcha image

    # Open the image in a with-block so the underlying file is closed once
    # OCR has finished.
    with Image.open(captchaFile) as image:
        deCaptcha = DeCaptcha()
        deCaptcha.crack(image)


if __name__ == "__main__":
    main()

Tesseract的文档说它在进行OCR之前会先做一次二值化预处理，但我感觉好像并非如此，因为如果我在上面的代码中不自己先做一遍二值化，识别出来的结果明显有差。=。=#

另外，tesseract可以通过训练来增强识别水平，但我试了大半天没成功（文档太少了。。。）遂放弃。

pytesseract识别验证码

1. 网页截图并截取验证码

2. 对验证码图片进行OCR识别（包括预处理）

2 thoughts on “pytesseract识别验证码”

Leave a Comment Cancel Reply