Libraries used: Requests, bs4, re, and PyMySQL.
The goal is to store the scraped links in a database split across three tables (a minimal schema sketch follows the list):
category: stores the categories
video: stores the video metadata
link: stores the download links
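The post never shows the actual table definitions, so here is a minimal schema sketch inferred from the INSERT statements in the code below; the column types and sizes, and the exact layout of link and category, are assumptions rather than the original schema:

import pymysql

# Hypothetical DDL inferred from the crawler's INSERT statements; types,
# sizes and key layout are assumptions, not the original schema.
SCHEMA = [
    'create table if not exists category ('
    'id int primary key auto_increment, name varchar(64))',
    'create table if not exists video ('
    'id int primary key auto_increment, name varchar(255), '
    'title varchar(255), cid int)',   # cid points at category.id
    'create table if not exists link ('
    'id int primary key auto_increment, vid int, link varchar(1024))',   # vid points at video.id
]

db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
for ddl in SCHEMA:
    cursor.execute(ddl)
db.commit()
db.close()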
Because the site's pages are not well structured, the code needs a lot of conditional checks and regular expressions to pull out the correct links. Some links probably still slip through, but so be it. Otherwise the page structure is fairly simple, which makes the site good practice material.
In the end it crawled 28,000+ videos and 140,000+ links over a few dozen hours (there are sleeps in between). The full code follows:
import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql

# Pretty-print elapsed time plus an optional progress string
def printTime(timeFloat, hasFinish=''):
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    print(' progress ' + hasFinish)

class eachVideo:
    def __init__(self):
        # SQL templates; only the video statements survived in the original
        # listing, the link/category ones are reconstructions that match the
        # three-table schema described above
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.insertLink = 'insert into link (vid,link) values (%s,%s)'
        self.insertCategory = 'insert into category (name) values (%s)'
        self.selectCategory = 'select id from category where name=%s'

    # Open a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db
    # Fetch a page and return a BeautifulSoup object; the site serves
    # gb2312/gbk, so decoding falls back from one to the other
    def getSoup(self, url):
        # the original UA/cookie header string was garbled; a plain
        # desktop-browser user-agent works as well
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=2)
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup
    # Get the page title
    def getTitle(self, soup):
        title = soup.title.text
        return title
    # Extract the movie name, which the site wraps in 《》 inside the title
    def getName(self, title):
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            # some pages skip the 《》 marks, fall back to the full title
            name = title
        return name

    # Download the poster image, named after the video id
    def getPic(self, soup, name, vid):
        imageLink = soup.find('img').get('src')
        image = requests.get(imageLink, timeout=1)
        path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
        with open(path, 'wb') as f:
            f.write(image.content)

    # Collect download links: the site puts them in <td> cells, filtered by
    # protocol prefix (the exact regex was lost, this one is a guess)
    def getLinks(self, soup):
        link = soup.find_all('td')
        links = []
        for l in link:
            try:
                if re.search('ftp|magnet|ed2k|thunder', l.text.lower()) is not None:
                    links.append(l.text)
            except Exception:
                continue
        return links

    # Parse one video page and store the video, its links and its poster
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLinks(soup)
            if len(links) > 0:
                cursor.executemany(self.insertLink, [(vid, link) for link in links])
                self.getPic(soup, name, vid)
            else:
                print(' ' + name + '---no link found')
        db.commit()
        db.close()

    # Get every video detail-page link on one list page
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all('a')
        trueUrls = []
        for u in urls:
            trueUrl = u.get('href')
            # the detail-page regex was lost; detail pages end in <digits>.html
            if trueUrl is not None and re.search(r'/\d*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls
    # Get the links on every page of one category: read the page count from
    # the pager text, then walk index.html, index_2.html, ...
    def getEveryVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl).text
        # the pager prints something like "1/123"; the original regex was
        # garbled, this keeps its surviving "\d*?/(\d*)" core
        pageCount = re.search(r'\d*?/(\d*)', soup).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                # one retry before giving up on a page
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                except Exception:
                    continue
        return everyTrueUrls

    # Category paths on the site; a few earlier appends were lost in the
    # original listing
    def getCategorys(self):
        categorys = []
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/zongyi2013')
        categorys.append('html/dongman')
        categorys.append('html/3gp')
        return categorys
    # Walk every category: register the category, then fetch and store
    # every video in it
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for category in categorys:
            url = 'https://www.dy2018.com/' + category
            try:
                soup = self.getSoup(url)
            except Exception:
                # one retry, then skip the whole category
                try:
                    soup = self.getSoup(url)
                except Exception:
                    continue
            # the category name is the second <a> on the page
            titleAll = soup.find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            ca = cursor.execute(self.selectCategory, categoryTitle.strip())
            if ca == 0:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                cid = cursor.fetchone()[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for everyUrl in everyUrls:
                videoUrl = 'https://www.dy2018.com' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception as e:
                    print(e)
                    continue
                timeFinishOne = time.time()
                hasFinish = str(everyUrls.index(everyUrl) + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' done ----------------------------')
if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategorys()
    video.getAllVideoLink(categorys)
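After a full run, a quick way to sanity-check the 28,000+/140,000+ totals is to count rows directly; this snippet assumes the table names from the schema sketch above:

import pymysql

# Count the stored videos and links; assumes the schema sketched earlier.
db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
cursor.execute('select count(*) from video')
print('videos:', cursor.fetchone()[0])
cursor.execute('select count(*) from link')
print('links:', cursor.fetchone()[0])
db.close()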