เก็บข้อมูลเว็บด้วยวิธี web crawling ใน Python 3

สวัสดีทุกท่านครับ ในการรวบรวมข้อมูลบางอย่าง เช่น ข้อมูลเบอร์โทรศัพท์ หากข้อมูลนั้นอยู่บนเว็บไซต์และมีจำนวนไม่มาก คุณก็แค่คัดลอกมาเอง แต่ถ้าข้อมูลมีจำนวนหลายหน้า คงไม่ไหวที่จะมานั่งคัดลอกเองแน่ บทความนี้จะพาผู้อ่านไปเก็บข้อมูลเว็บด้วยวิธี web crawling ใน Python 3
Web crawler เป็น bot ชนิดหนึ่งที่ทำงานบนอินเทอร์เน็ต ทำหน้าที่ไล่เก็บข้อมูลหน้าเว็บเพื่อจัดทำดัชนีเว็บ มีอีกชื่อหนึ่งเรียกว่า Web spider
ในภาษา Python 3 มีนักพัฒนาได้พัฒนาโมดูลสำหรับทำ web crawling ออกมา ผมขอแนะนำโมดูล Pomp
ซึ่งใช้งานง่าย สามารถกำหนดเนื้อหาที่ต้องการได้ และเผยแพร่ภายใต้สัญญาอนุญาตแบบ BSD license
ในการติดตั้งโมดูลนี้ ผมแนะนำให้ผู้อ่านโหลดโค้ดมาติดตั้งเองครับ จาก https://bitbucket.org/estin/pomp/downloads คลิก Download repository แล้วแตกไฟล์ เปิดคอมมานด์ไลน์เข้าไปที่โฟลเดอร์ที่แตกไฟล์ไว้แล้วใช้คำสั่ง python setup.py install ดูเพิ่มเติมการติดตั้งโมดูลใน Python
ตัวอย่างการใช้ web crawling
ค้นหาคำว่า python จากเว็บ http://python.org/news

import re
from pomp.core.base import BaseCrawler


# Matches a run of word/whitespace characters that contains "python"
# (case-insensitive). Written as a raw string so that `\w` and `\s` reach the
# regex engine instead of being parsed as (invalid) string escape sequences.
python_sentence_re = re.compile(r'[\w\s]*python[\s\w]*', re.I | re.M)


class MyCrawler(BaseCrawler):
    """Extract all sentences with `python` word"""
    ENTRY_REQUESTS = 'http://python.org/news'  # page the crawl starts from

    def extract_items(self, response):
        """Yield every stripped match of ``python_sentence_re`` in the page.

        The response body is decoded as UTF-8 before matching, and each match
        is printed as it is produced. A generator is used rather than
        ``return`` inside the loop so that all matches are emitted instead of
        only the first one.
        """
        page = response.body.decode('utf-8')
        for match in python_sentence_re.findall(page):
            item = match.strip()
            print(item)
            yield item

    def next_requests(self, response):
        """Return None: no follow-up requests, so the crawl stops after one page."""
        return None


if __name__ == '__main__':
    # Imported inside the guard, so these are only loaded when run as a script.
    from pomp.contrib.urllibtools import UrllibDownloader
    from pomp.core.engine import Pomp

    # Build the engine with the UrllibDownloader and run the crawler on it.
    pomp = Pomp(downloader=UrllibDownloader())
    pomp.pump(MyCrawler())

ผลลัพธ์
python

ดึงข่าวสารจาก https://python.org/news/

"""
Extract python news from python.org
"""
import sys
import re
import logging
from pomp.core.base import BaseCrawler, BasePipeline
from pomp.core.item import Item, Field
from pomp.contrib.urllibtools import UrllibDownloader


logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
# NOTE(review): the HTML tags of this pattern were stripped when the listing
# was published, which left an unterminated string literal (a SyntaxError).
# The markup below is restored from the upstream pomp "python news" example;
# confirm it against the current python.org/news page before relying on it.
# Groups: 0 = title, 1 = text between title and date, 2 = published date.
news_re = re.compile(
    r'<h2 class="news">(.*?)</h2>([\s\S]*?)<div class="pubdate">(.*?)</div>')


class PythonNewsItem(Item):
    """Item with two fields: ``title`` and ``published``."""

    title = Field()
    published = Field()

    def __repr__(self):
        """Return the title on one line and the tab-indented published value on the next."""
        parts = (self.title, self.published)
        return '%s\n\t%s\n' % parts


class PythonNewsCrawler(BaseCrawler):
    """Crawler that turns ``news_re`` matches on the entry page into items."""

    ENTRY_REQUESTS = 'https://python.org/news/'

    def extract_items(self, response):
        """Yield one ``PythonNewsItem`` per ``news_re`` match in the UTF-8 decoded body."""
        page = response.body.decode('utf-8')
        for match in news_re.findall(page):
            entry = PythonNewsItem()
            # Group 0 is used as the title and group 2 as the published value;
            # group 1 is ignored.
            title, published = match[0], match[2]
            entry.title, entry.published = title, published
            yield entry

    def next_requests(self, response):
        """Return None, i.e. no follow-up requests (one page crawler)."""
        return None


class PrintPipeline(BasePipeline):
    """Pipeline that prints each item and hands it back unchanged."""

    def process(self, crawler, item):
        """Print ``item`` and return it; ``crawler`` is not used here."""
        print(item)
        return item


if __name__ == '__main__':
    from pomp.core.engine import Pomp

    # Engine wired with the UrllibDownloader and a pipeline that prints items.
    pomp = Pomp(
        downloader=UrllibDownloader(),
        pipelines=[PrintPipeline()],
    )

    pomp.pump(PythonNewsCrawler())
    # Wait for Enter before exiting. Kept inside the guard so that importing
    # this module never blocks on stdin.
    input()

ผลลัพธ์

เก็บข้อมูลเว็บด้วยวิธี web crawling ใน Python 3

อ่านเอกสารการใช้งานโมดูลได้ที่ https://pomp.readthedocs.org/
หน้าเว็บโมดูล pomp https://bitbucket.org/estin/pomp
ติดตามบทความต่อไปนะครับ
ขอบคุณครับ

0 ความคิดเห็น:

แสดงความคิดเห็น

แสดงความคิดเห็นได้ครับ :)

Python 3

สอนเขียนโปรแกรมด้วยภาษา Python 3

31 ธันวาคม 2557

เก็บข้อมูลเว็บด้วยวิธี web crawling ใน Python 3

(.*?)

0 ความคิดเห็น:

แสดงความคิดเห็น

Search

Popular Posts

Categories

Blog Archive

License

ค้นหาบล็อกนี้

หนังสือ Python ภาษาไทย ออนไลน์