
Scraping Discuz forum data with a Python crawler


I'm planning to build a forum, but a forum needs a lot of content, and posting all of it by hand would be far too much work.

So I decided to write a beginner-level crawler in Python. Once the crawler was done, I realized that collecting the data is actually very easy; the hard part is inserting it into Discuz.

A single Discuz post touches too many tables, which instantly put me off that idea. Instead, we can write a simple blog of our own and insert the collected data there; that is a workable solution too, isn't it?
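The dbsql() function further down writes each post into a table named test inside a database named blog, but the script never creates them. Here is a minimal sketch of what that table could look like; the column types and character set are my own assumptions, not something from the original post:

import pymysql

# One-off setup: create the "blog" database and "test" table the crawler assumes.
# Column sizes/types below are guesses; adjust them to your own needs.
db = pymysql.connect(host="localhost", user="root", password="root", port=3306)
cur = db.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS blog DEFAULT CHARACTER SET utf8mb4")
cur.execute(
    "CREATE TABLE IF NOT EXISTS blog.test ("
    "  id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,"
    "  title VARCHAR(255) NOT NULL,"
    "  content TEXT NOT NULL"
    ") DEFAULT CHARACTER SET utf8mb4"
)
db.commit()
db.close()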

In any case, the crawler is finished and I'm sharing it with everyone. I also hope you'll keep supporting 童攀课堂, where we regularly publish articles with real technical content:

from bs4 import BeautifulSoup
import requests
import re
import time
import pymysql  

def dbsql(title, content):
    # Connect to the local "blog" database and insert one record
    db = pymysql.connect(host="localhost", user="root",
                         password="root", db="blog", port=3306)
    cur = db.cursor()
    # Use a parameterized query so quotes in the scraped text can't break the SQL
    sql_insert = "insert into test(title, content) values(%s, %s)"
    try:
        cur.execute(sql_insert, (title, content))
        # Commit the insert
        db.commit()
    except Exception as e:
        # Roll back on error
        db.rollback()
    finally:
        db.close()

def crawl_detail(aurl):
    # Fetch one thread page and extract the title and body
    req = requests.get(aurl)
    soup = BeautifulSoup(req.content, 'lxml')
    neirong = soup.find('div', {'class': 't_fsz'})        # post body container
    title = soup.find('span', {'id': 'thread_subject'})   # thread title
    title = title.text
    content = neirong.text
    content = content.replace('\\', '')   # strip backslashes from the scraped text
    content = content.replace("'", '')    # strip single quotes from the scraped text
    print(content)
    print('----------------------------------------------')
    dbsql(title, content)
    # Be polite to the forum: wait 10 seconds between detail pages
    time.sleep(10)


def main():
    # Specific threads the author chose to skip
    skip = {'thread-1064-1-1.html', 'thread-2177-1-1.html', 'thread-801-1-1.html'}
    # Crawl the first two pages of the forum board
    for x in range(1, 3):
        src = 'http://club.topsage.com/forum-481-{}.html'.format(x)
        req = requests.get(src)
        soup = BeautifulSoup(req.content, 'lxml')
        # Thread links on a Discuz list page carry the class "s xst"
        sg_urls = soup.find_all('a', {'class': 's xst'})
        for url in sg_urls:
            aurl = url.get('href')
            if aurl in skip:
                continue
            crawl_detail(aurl)

        time.sleep(3)
        print('-------------------')


if __name__ == '__main__':
    main()
    # dbsql('a','b')
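A short usage note: the script relies on the third-party packages it imports, so install them first, for example with pip install requests beautifulsoup4 lxml pymysql. It also assumes a local MySQL server with the blog database and test table sketched above.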