不学网

 找回密码
 立即注册

只需一步,快速开始

手机号码,快捷登录

查看: 152|回复: 0

[python] 原创网盘爬虫

[复制链接]
rain1994 发表于 2018-6-9 11:21:35 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能,让你轻松玩转社区。

您需要 登录 才可以下载或查看,没有帐号?立即注册

x
本帖最后由 rain1994 于 2018-6-9 11:22 编辑

针对的网站有输入验证码操作,所以用了延时请求。只爬取了第一页内容
  1. # -*- coding: UTF-8 -*-
  2. import urllib.request
  3. import zlib
  4. import re
  5. import time
  6. import chardet

  7. def getfilepath(text):
  8.     req = urllib.request.Request(text)
  9.     time.sleep( 5 )
  10.     response= urllib.request.urlopen(req)
  11.     data =response.read()
  12.     html = data
  13.     gzipped = response.headers.get('Content-Encoding')#查看是否服务器是否支持gzip
  14.     if gzipped:
  15.           html = zlib.decompress(html, 16+zlib.MAX_WBITS)#解压缩,得到网页源码
  16.     html=html.decode()
  17.     # text = re.findall(r"<a.*?>点击去百度云盘下载资源</a>", html)
  18.     # print(text)
  19.     res_url = r"(?<=<a target="_blank" class="dbutton2" href=").+?(?=")|(?<=href=\').+?(?=\')"
  20.     link = re.findall(res_url ,  html, re.I|re.S|re.M)
  21.     url=''
  22.     for url in link:
  23.         print(url)

  24.     req = urllib.request.Request(url)
  25.     time.sleep( 5 )
  26.     response= urllib.request.urlopen(req)
  27.     data =response.read()
  28.     html = data
  29.     gzipped = response.headers.get('Content-Encoding')#查看是否服务器是否支持gzip
  30.     if gzipped:
  31.           html = zlib.decompress(html, 16+zlib.MAX_WBITS)#解压缩,得到网页源码
  32.     html=html.decode()
  33.     res_url = r"(?<=<a href=").+?(?=")|(?<=href=\').+?(?=\')"
  34.     link = re.findall(res_url ,  html, re.I|re.S|re.M)
  35.     url=''
  36.     for url in link:
  37.         print(url)

  38.     req = urllib.request.Request(url)
  39.     response= urllib.request.urlopen(req)
  40.     data =response.read()
  41.     html = data
  42.     gzipped = response.headers.get('Content-Encoding')#查看是否服务器是否支持gzip
  43.     if gzipped:
  44.           html = zlib.decompress(html, 16+zlib.MAX_WBITS)#解压缩,得到网页源码
  45.     html=html.decode()
  46.     # print(html)
  47.     if "/box-static/disk-share/widget/pageModule/error/img/errorImg_b3a6155.png" in html:
  48.         print("分享链接无效")
  49.         url=''
  50.     return url
  51. def gethtml(text):
  52.     print(text)
  53.     url = 'http://www.panduoduo.net/'
  54.     text = urllib.parse.quote(text)
  55.     print(text)
  56.     data = "s/name/" + text
  57.     print(data)
  58.     url=url+data
  59.     print(url)
  60.     # data = bytes(data,encoding = 'utf-8')
  61.     req = urllib.request.Request(url)
  62.     time.sleep( 5 )
  63.     response= urllib.request.urlopen(req)
  64.     data =response.read()
  65.     html = data
  66.     gzipped = response.headers.get('Content-Encoding')#查看是否服务器是否支持gzip
  67.     if gzipped:
  68.           html = zlib.decompress(html, 16+zlib.MAX_WBITS)#解压缩,得到网页源码
  69.     # html = zlib.decompress(data, 16+zlib.MAX_WBITS)#解压缩,得到网页源码
  70.     # type = chardet.detect(html).get('encoding','utf-8')##通过第3方模块来自动提取网页的编码
  71.     # htmlCode_encode = html.decode('utf-8').encode(type)
  72.     # print(html.decode())
  73.     with open('1.html', 'wb') as f:
  74.         f.write(html)
  75.     return html.decode()
  76. text = str(input('Please enter search data:'))
  77. text=gethtml(text)
  78. text = re.findall(r"<h3.*?>.*?</h3>", text)
  79. dict={}
  80. for t in text:
  81.     # print(t)
  82.     res_url = r"(?<=href=").+?(?=")|(?<=href=\').+?(?=\')"
  83.     link = re.findall(res_url ,  t, re.I|re.S|re.M)
  84.     url=''
  85.     for url in link:
  86.         url='http://www.panduoduo.net'+url
  87.         print(url)

  88.     k = re.sub("<h3>.*<a.*?>", "", t)
  89.     k = re.sub("</a>.*</h3>", "", k)
  90.     dict.setdefault(k,url)

  91. print(dict)
  92. for key in dict.keys():
  93.        print(key+':'+dict[key])
  94.        dict[key]=getfilepath(dict[key])
  95.        with open('2.html', 'a') as f:
  96.             f.write(key+':'+dict[key]+'\n')

复制代码



bb.png
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|不学网

GMT+8, 2018-6-21 16:45

Powered by Discuz! X3.4

© 2001-2017 Comsenz Inc.

快速回复 返回顶部 返回列表