
[python] An original netdisk crawler

rain1994, posted 2018-6-9 11:21:35

Last edited by rain1994 on 2018-6-9 11:22.

The target site throws up a CAPTCHA when it is hit too quickly, so delayed requests are used. Only the first page of search results is crawled.
# -*- coding: UTF-8 -*-
import urllib.parse
import urllib.request
import zlib
import re
import time


def getfilepath(text):
    # Step 1: fetch the resource page and extract the Baidu-pan share link.
    req = urllib.request.Request(text)
    time.sleep(5)  # delay between requests to avoid the site's CAPTCHA check
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')  # check whether the server returned a gzip-compressed body
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)  # decompress to get the page source
    html = html.decode()
    res_url = r'(?<=<a target="_blank" class="dbutton2" href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, html, re.I | re.S | re.M)
    url = ''
    for url in link:
        print(url)

    # Step 2: follow the intermediate jump page.
    req = urllib.request.Request(url)
    time.sleep(5)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    html = html.decode()
    res_url = r'(?<=<a href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, html, re.I | re.S | re.M)
    url = ''
    for url in link:
        print(url)

    # Step 3: fetch the share page and check whether the share is still valid.
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    html = html.decode()
    if "/box-static/disk-share/widget/pageModule/error/img/errorImg_b3a6155.png" in html:
        print("Share link is no longer valid")
        url = ''
    return url


def gethtml(text):
    # Build the search URL and fetch the first page of results.
    print(text)
    url = 'http://www.panduoduo.net/'
    text = urllib.parse.quote(text)  # percent-encode the search keyword
    print(text)
    data = "s/name/" + text
    print(data)
    url = url + data
    print(url)
    req = urllib.request.Request(url)
    time.sleep(5)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    with open('1.html', 'wb') as f:  # save the raw results page for inspection
        f.write(html)
    return html.decode()


text = input('Please enter search data:')
text = gethtml(text)
text = re.findall(r"<h3.*?>.*?</h3>", text)  # each search result title sits inside an <h3>
results = {}
for t in text:
    res_url = r'(?<=href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, t, re.I | re.S | re.M)
    url = ''
    for url in link:
        url = 'http://www.panduoduo.net' + url
        print(url)

    k = re.sub("<h3>.*<a.*?>", "", t)  # strip the markup around the resource name
    k = re.sub("</a>.*</h3>", "", k)
    results.setdefault(k, url)

print(results)
for key in results.keys():
    print(key + ':' + results[key])
    results[key] = getfilepath(results[key])
    with open('2.html', 'a') as f:
        f.write(key + ':' + results[key] + '\n')
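
The link extraction above leans on regex lookbehind/lookahead so that re.findall returns just the URL between the quotes, with none of the surrounding markup. A minimal demonstration of the pattern (the sample HTML here is invented for illustration, not taken from the live site):

import re

# Hypothetical anchor tag in the shape the crawler expects.
sample = '<a target="_blank" class="dbutton2" href="https://pan.baidu.com/s/abc123">download</a>'

# Same pattern as in the script: text preceded by the opening tag and followed by a closing quote.
pattern = r'(?<=<a target="_blank" class="dbutton2" href=").+?(?=")'
print(re.findall(pattern, sample))  # ['https://pan.baidu.com/s/abc123']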

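As a design note, the request/decompress/decode sequence is repeated three times in getfilepath and once more in gethtml; it could be folded into a single helper. A sketch under that assumption (the name fetch_html is mine, not from the original post):

import urllib.request
import zlib
import time

def fetch_html(url, headers=None, delay=5):
    """Fetch a page, transparently un-gzip it, and return the decoded source."""
    if delay:
        time.sleep(delay)  # keep the polite delay that avoids the CAPTCHA
    req = urllib.request.Request(url, headers=headers or {})
    response = urllib.request.urlopen(req)
    body = response.read()
    if response.headers.get('Content-Encoding'):  # gzip-compressed response
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
    return body.decode()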




rain1994 (OP), posted 2018-6-30 20:10:51
The earlier code no longer works: the Python request has to be disguised as a browser by adding request-header information (a User-Agent).

# -*- coding: UTF-8 -*-
import urllib.parse
import urllib.request
import zlib
import re
import time

# Pretend to be a desktop Chrome browser so the site accepts the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36'}


def getfilepath(text):
    # Step 1: fetch the resource page and extract the Baidu-pan share link.
    req = urllib.request.Request(text, headers=headers)
    time.sleep(5)  # delay between requests to avoid the site's CAPTCHA check
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')  # check whether the server returned a gzip-compressed body
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)  # decompress to get the page source
    html = html.decode()
    res_url = r'(?<=<a target="_blank" class="dbutton2" href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, html, re.I | re.S | re.M)
    url = ''
    for url in link:
        print(url)

    # Step 2: follow the intermediate jump page.
    req = urllib.request.Request(url)
    time.sleep(5)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    html = html.decode()
    res_url = r'(?<=<a href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, html, re.I | re.S | re.M)
    url = ''
    for url in link:
        print(url)

    # Step 3: fetch the share page and check whether the share is still valid.
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    html = html.decode()
    if "/box-static/disk-share/widget/pageModule/error/img/errorImg_b3a6155.png" in html:
        print("Share link is no longer valid")
        url = ''
    return url


def gethtml(text):
    # Build the search URL and fetch the first page of results.
    print(text)
    url = 'http://www.panduoduo.net/'
    text = urllib.parse.quote(text)  # percent-encode the search keyword
    print(text)
    data = "s/name/" + text
    print(data)
    url = url + data
    print(url)
    req = urllib.request.Request(url, headers=headers)
    time.sleep(5)
    response = urllib.request.urlopen(req)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    with open('1.html', 'wb') as f:  # save the raw results page for inspection
        f.write(html)
    return html.decode()


text = input('Please enter search data:')
text = gethtml(text)
text = re.findall(r"<h3.*?>.*?</h3>", text)  # each search result title sits inside an <h3>
results = {}
for t in text:
    res_url = r'(?<=href=").+?(?=")|(?<=href=\').+?(?=\')'
    link = re.findall(res_url, t, re.I | re.S | re.M)
    url = ''
    for url in link:
        url = 'http://www.panduoduo.net' + url
        print(url)

    k = re.sub("<h3>.*<a.*?>", "", t)  # strip the markup around the resource name
    k = re.sub("</a>.*</h3>", "", k)
    results.setdefault(k, url)

print(results)
for key in results.keys():
    print(key + ':' + results[key])
    results[key] = getfilepath(results[key])
    with open('2.html', 'a') as f:
        f.write(key + ':' + results[key] + '\n')
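
Instead of attaching the headers dict to each Request by hand, the User-Agent could also be installed once on a global opener, after which every plain urlopen() call sends it automatically. A sketch of that alternative using the standard urllib API (this is not what the post itself does):

import urllib.request

# Install a default User-Agent once; every later urlopen() call will send it.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36')]
urllib.request.install_opener(opener)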

