pytesseract识别验证码

验证码破解是做爬虫时经常要面对的问题。对于一般的字符串或算式验证码,可以先对网页截图,再通过OCR识别出验证码字符串。

  1. 网页截图可以使用selenium的save_screenshot()方法;
  2. 用PIL图像处理库来截取验证码图片;
  3. 用Tesseract来做OCR识别。

要安装的python第三方库有:selenium、pillow(PIL)、pytesseract

要安装的软件有:selenium用的webdriver、pytesseract用的Tesseract

1. 网页截图并截取验证码

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
import selenium
from PIL import Image
...
fileScreenshot = "截屏文件保存路径"
fileCaptcha = "验证码图片保存路径"
self.logger.debug("屏幕截图: %s" % fileScreenshot)
self.webdriver.save_screenshot(fileScreenshot)
# 计算验证码图片的位置大小
imgCaptcha = self.webdriver.find_element_by_id("验证码img标签的id")
loc = imgCaptcha.location
size = imgCaptcha.size
left = int(loc['x'])
top = int(loc['y'])
right = int(loc['x'] + size['width'])
bottom = int(loc['y'] + size['height'])
self.logger.debug("截取验证码: %s" % fileCaptcha)
image = Image.open(fileScreenshot)
image = image.crop((left, top, right, bottom))
image.save(fileCaptcha)
import selenium from PIL import Image ... fileScreenshot = "截屏文件保存路径" fileCaptcha = "验证码图片保存路径" self.logger.debug("屏幕截图: %s" % fileScreenshot) self.webdriver.save_screenshot(fileScreenshot) # 计算验证码图片的位置大小 imgCaptcha = self.webdriver.find_element_by_id("验证码img标签的id") loc = imgCaptcha.location size = imgCaptcha.size left = int(loc['x']) top = int(loc['y']) right = int(loc['x'] + size['width']) bottom = int(loc['y'] + size['height']) self.logger.debug("截取验证码: %s" % fileCaptcha) image = Image.open(fileScreenshot) image = image.crop((left, top, right, bottom)) image.save(fileCaptcha)
import selenium
from PIL import Image
from selenium.webdriver.common.by import By

...

fileScreenshot = "截屏文件保存路径"
fileCaptcha = "验证码图片保存路径"

# Capture the page as rendered by the browser; the CAPTCHA is cut out of
# this file below.
self.logger.debug("屏幕截图: %s", fileScreenshot)
self.webdriver.save_screenshot(fileScreenshot)


# Compute the CAPTCHA's bounding box from the <img> element's position and size.
# find_element(By.ID, ...) is used because the find_element_by_id shortcut
# was removed in Selenium 4.3.
# NOTE(review): assumes screenshot pixels map 1:1 onto element coordinates
# (no device-pixel-ratio scaling, element inside the captured area) - confirm.
imgCaptcha = self.webdriver.find_element(By.ID, "验证码img标签的id")
loc = imgCaptcha.location
size = imgCaptcha.size
left = int(loc['x'])
top = int(loc['y'])
right = int(loc['x'] + size['width'])
bottom = int(loc['y'] + size['height'])

self.logger.debug("截取验证码: %s", fileCaptcha)
# The context manager closes the screenshot file once the crop is taken;
# `image` stays bound to the cropped CAPTCHA, as before.
with Image.open(fileScreenshot) as screenshot:
    image = screenshot.crop((left, top, right, bottom))
image.save(fileCaptcha)

2. 对验证码图片进行OCR识别(包括预处理)

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
# -*- coding: utf-8 -*-
import logging
import logging.config
import pytesseract
from PIL import Image
class DeCaptcha(object):
"""验证码破解"""
def __init__(self):
"""构造函数"""
super(DeCaptcha, self).__init__()
self.logger = logging.getLogger(self.__class__.__name__)
self.logger.debug("Init DeCaptcha instance")
pass
def crack(self, imgCaptcha):
"""通过OCR识别验证码
Args:
imgCaptcha: PIL Image类型的图像实例
Returns:
识别出来的字符串
"""
#code = pytesseract.image_to_string(imgCaptcha, config="-psm 7")
code = pytesseract.image_to_string(self.binarize(imgCaptcha), config="-psm 7")
self.logger.debug("OCR识别结果: %s" % code)
return code
def binarize(self, imgCaptcha):
"""将图像二值化(黑白)
Args:
imgCaptcha: PIL Image类型的图像实例
Returns:
PIL Image类型的黑白图像
"""
# 转换成灰度图
image = imgCaptcha.convert('L')
# 创建二值化映射表
threshold = 130
table = []
for i in range(256):
if i < threshold:
table.append(0)
else:
table.append(1)
# 二值化
return image.point(table, '1')
def main():
logging.config.fileConfig("config/logging.config")
captchaFile = "验证码图片路径"
image = Image.open(captchaFile)
deCaptcha = DeCaptcha()
deCaptcha.crack(image)
pass
if __name__ == "__main__":
main()
# -*- coding: utf-8 -*- import logging import logging.config import pytesseract from PIL import Image class DeCaptcha(object): """验证码破解""" def __init__(self): """构造函数""" super(DeCaptcha, self).__init__() self.logger = logging.getLogger(self.__class__.__name__) self.logger.debug("Init DeCaptcha instance") pass def crack(self, imgCaptcha): """通过OCR识别验证码 Args: imgCaptcha: PIL Image类型的图像实例 Returns: 识别出来的字符串 """ #code = pytesseract.image_to_string(imgCaptcha, config="-psm 7") code = pytesseract.image_to_string(self.binarize(imgCaptcha), config="-psm 7") self.logger.debug("OCR识别结果: %s" % code) return code def binarize(self, imgCaptcha): """将图像二值化(黑白) Args: imgCaptcha: PIL Image类型的图像实例 Returns: PIL Image类型的黑白图像 """ # 转换成灰度图 image = imgCaptcha.convert('L') # 创建二值化映射表 threshold = 130 table = [] for i in range(256): if i < threshold: table.append(0) else: table.append(1) # 二值化 return image.point(table, '1') def main(): logging.config.fileConfig("config/logging.config") captchaFile = "验证码图片路径" image = Image.open(captchaFile) deCaptcha = DeCaptcha() deCaptcha.crack(image) pass if __name__ == "__main__": main()
# -*- coding: utf-8 -*-

import logging
import logging.config
import pytesseract
from PIL import Image


class DeCaptcha(object):
    """CAPTCHA solver: binarizes an image and runs Tesseract OCR on it."""

    def __init__(self):
        """Create the solver and attach a logger named after the class."""
        super(DeCaptcha, self).__init__()

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.debug("Init DeCaptcha instance")

    def crack(self, imgCaptcha):
        """Recognise the text of a CAPTCHA image via OCR.

        Args:
            imgCaptcha: PIL Image instance holding the CAPTCHA.
        Returns:
            The string recognised by pytesseract on the binarized image.
        """
        # Page segmentation mode 7 treats the image as a single text line.
        # The option must be spelled "--psm": Tesseract 4+ no longer accepts
        # the legacy single-dash "-psm" form.
        code = pytesseract.image_to_string(
            self.binarize(imgCaptcha), config="--psm 7")
        self.logger.debug("OCR识别结果: %s", code)
        return code

    def binarize(self, imgCaptcha, threshold=130):
        """Convert an image to pure black and white.

        Args:
            imgCaptcha: PIL Image instance.
            threshold: Grey level (0-255) at or above which a pixel becomes
                white; anything darker becomes black. Defaults to 130.
        Returns:
            A mode '1' (bilevel) PIL Image.
        """
        # Reduce to 8-bit greyscale so each pixel is a single 0-255 level.
        image = imgCaptcha.convert('L')

        # Lookup table mapping every possible grey level to 0 or 1.
        table = [0 if level < threshold else 1 for level in range(256)]

        return image.point(table, '1')


def main():
    """Configure logging from file, open the CAPTCHA image and run OCR on it."""
    logging.config.fileConfig("config/logging.config")
    captchaFile = "验证码图片路径"

    # The context manager closes the image file once OCR has finished.
    with Image.open(captchaFile) as image:
        deCaptcha = DeCaptcha()
        deCaptcha.crack(image)


if __name__ == "__main__":
    main()

Tesseract的文档说它在进行OCR之前会先做一次二值化预处理,但我感觉好像并非如此,因为如果我在上面的代码中不自己先做一遍二值化,识别出来的结果明显要差一些。=。=#

另外,tesseract可以通过训练来增强识别水平,但我试了大半天没成功(文档太少了。。。)遂放弃。

 

2 thoughts on “pytesseract识别验证码”

Leave a Comment

Your email address will not be published. Required fields are marked *