Libraries used: Requests, bs4, re, and PyMySQL.
The goal is to store the scraped links in a database split across three tables (a minimal schema sketch follows the list):
category: stores the categories
video: stores the video metadata
link: stores the download links
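The post never shows the actual table definitions, so here is a minimal schema sketch inferred from the INSERT statements in the code below; the column types and sizes, and the exact layout of link and category, are assumptions rather than the original schema:

import pymysql

# Hypothetical DDL inferred from the crawler's INSERT statements; types,
# sizes and key layout are assumptions, not the original schema.
SCHEMA = [
    'create table if not exists category ('
    'id int primary key auto_increment, name varchar(64))',
    'create table if not exists video ('
    'id int primary key auto_increment, name varchar(255), '
    'title varchar(255), cid int)',   # cid points at category.id
    'create table if not exists link ('
    'id int primary key auto_increment, vid int, link varchar(1024))',   # vid points at video.id
]

db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
for ddl in SCHEMA:
    cursor.execute(ddl)
db.commit()
db.close()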
Because the site's pages are not well structured, the code needs a lot of conditional checks and regular expressions to pull out the correct links. Some links probably still slip through, but so be it. Otherwise the page structure is fairly simple, which makes the site good practice material.
In the end it crawled 28,000+ videos and 140,000+ links over a few dozen hours (there are sleeps in between). The full code follows:
import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql

# Pretty-print elapsed time plus an optional progress string
def printTime(timeFloat, hasFinish=''):
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    print(' progress ' + hasFinish)

class eachVideo:
    def __init__(self):
        # SQL templates; only the video statements survived in the original
        # listing, the link/category ones are reconstructions that match the
        # three-table schema described above
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.insertLink = 'insert into link (vid,link) values (%s,%s)'
        self.insertCategory = 'insert into category (name) values (%s)'
        self.selectCategory = 'select id from category where name=%s'

    # Open a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db
    # Fetch a page and return a BeautifulSoup object; the site serves
    # gb2312/gbk, so decoding falls back from one to the other
    def getSoup(self, url):
        # the original UA/cookie header string was garbled; a plain
        # desktop-browser user-agent works as well
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=2)
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup
    # Get the page title
    def getTitle(self, soup):
        title = soup.title.text
        return title
    # Extract the movie name, which the site wraps in 《》 inside the title
    def getName(self, title):
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            # some pages skip the 《》 marks, fall back to the full title
            name = title
        return name

    # Download the poster image, named after the video id
    def getPic(self, soup, name, vid):
        imageLink = soup.find('img').get('src')
        image = requests.get(imageLink, timeout=1)
        path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
        with open(path, 'wb') as f:
            f.write(image.content)

    # Collect download links: the site puts them in <td> cells, filtered by
    # protocol prefix (the exact regex was lost, this one is a guess)
    def getLinks(self, soup):
        link = soup.find_all('td')
        links = []
        for l in link:
            try:
                if re.search('ftp|magnet|ed2k|thunder', l.text.lower()) is not None:
                    links.append(l.text)
            except Exception:
                continue
        return links

    # Parse one video page and store the video, its links and its poster
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLinks(soup)
            if len(links) > 0:
                cursor.executemany(self.insertLink, [(vid, link) for link in links])
                self.getPic(soup, name, vid)
            else:
                print(' ' + name + '---no link found')
        db.commit()
        db.close()

    # Get every video detail-page link on one list page
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all('a')
        trueUrls = []
        for u in urls:
            trueUrl = u.get('href')
            # the detail-page regex was lost; detail pages end in <digits>.html
            if trueUrl is not None and re.search(r'/\d*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls
    # Get the links on every page of one category: read the page count from
    # the pager text, then walk index.html, index_2.html, ...
    def getEveryVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl).text
        # the pager prints something like "1/123"; the original regex was
        # garbled, this keeps its surviving "\d*?/(\d*)" core
        pageCount = re.search(r'\d*?/(\d*)', soup).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                # one retry before giving up on a page
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                except Exception:
                    continue
        return everyTrueUrls

    # Category paths on the site; a few earlier appends were lost in the
    # original listing
    def getCategorys(self):
        categorys = []
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/zongyi2013')
        categorys.append('html/dongman')
        categorys.append('html/3gp')
        return categorys
    # Walk every category: register the category, then fetch and store
    # every video in it
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for category in categorys:
            url = 'https://www.dy2018.com/' + category
            try:
                soup = self.getSoup(url)
            except Exception:
                # one retry, then skip the whole category
                try:
                    soup = self.getSoup(url)
                except Exception:
                    continue
            # the category name is the second <a> on the page
            titleAll = soup.find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            ca = cursor.execute(self.selectCategory, categoryTitle.strip())
            if ca == 0:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                cid = cursor.fetchone()[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for everyUrl in everyUrls:
                videoUrl = 'https://www.dy2018.com' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception as e:
                    print(e)
                    continue
                timeFinishOne = time.time()
                hasFinish = str(everyUrls.index(everyUrl) + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' done ----------------------------')
if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategorys()
    video.getAllVideoLink(categorys)
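After a full run, a quick way to sanity-check the 28,000+/140,000+ totals is to count rows directly; this snippet assumes the table names from the schema sketch above:

import pymysql

# Count the stored videos and links; assumes the schema sketched earlier.
db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
cursor.execute('select count(*) from video')
print('videos:', cursor.fetchone()[0])
cursor.execute('select count(*) from link')
print('links:', cursor.fetchone()[0])
db.close()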