I wrote a Python crawler that scrapes proxy IP addresses. Since the target site is anti-crawler, I route requests through a proxy,
and I use a thread pool with 10 threads to crawl the IPs in parallel.
However, it immediately fails with 'list' object has no attribute 'get'.
I don't know how to fix it, so I'm posting my code.
from bs4 import BeautifulSoup
import requests
import re
import time
from multiprocessing import Pool
import pymysql
import random
from threadpool import *

def randHeader():
    # Assemble request headers with a randomly picked User-Agent
    head_connection = ['Keep-Alive', 'close']
    head_accept = ['text/html, application/xhtml+xml, */*']
    head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                       'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                       'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                       'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
    header = {
        'Connection': head_connection[0],
        'Accept': head_accept[0],
        'Accept-Language': head_accept_language[1],
        'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
    }
    return header

def randproxy():
    # Read all usable proxies from MySQL and return one at random
    config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': '',
        'db': 'autohome',
        'charset': 'utf8',
        # 'cursorclass': pymysql.cursors.DictCursor,
    }
    # Create the connection
    list_ip = []
    connection = pymysql.connect(**config)
    cursor = connection.cursor()
    sql = 'select ip,port from can_use'
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        for row in results:
            data = {
                'ip': row[0],
                'port': row[1]
            }
            list_ip.append(data)
    except:
        print("error")
        # time.sleep(1)
    finally:
        connection.close()
    return random.choice(list_ip)

def download(url):
    proxy = randproxy()
    proxy_host = "http://" + proxy['ip'] + ":" + proxy['port']
    proxy_temp = {"http": proxy_host}
    parse_url = requests.get(url[0],headers=randHeader(),timeout=12,proxies=proxy_temp)
    soup = BeautifulSoup(parse_url.text,'lxml')
    pre_proxys = soup.find('table', id='ip_list').find_all('tr')
    for i in pre_proxys[1:]:
        try:
            td = i.find_all('td')
            id = td[1].get_text()
            port = td[2].get_text()
            # Execute the SQL statement
            config = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'db': 'autohome',
                'charset': 'utf8',
                'cursorclass': pymysql.cursors.DictCursor,
            }
            # Create the connection
            connection = pymysql.connect(**config)
            data = {
                'ip':id,
                'port':port,
            }
            with connection.cursor() as cursor:
                # Execute the SQL statement to insert the record
                sql = 'INSERT INTO proxyip (ip,port) VALUES (%s,%s)'
                cursor.execute(sql, (data['ip'],data['port']))
                # Autocommit is not enabled by default, so commit explicitly to save the insert
                connection.commit()
        except:
            print("error")
            # time.sleep(1)
        finally:
            connection.close()
    time.sleep(2)

def proxy_url_list():
    url = "http://www.xicidaili.com/wt/{}"
    url_list = []
    for i in range(1,1387):
        new_url = url.format(i)
        url_list.append(new_url)
    return url_list

if __name__ == "__main__":
    pool = ThreadPool(2)
    requests = makeRequests(download,proxy_url_list())
    [pool.putRequest(req) for req in requests]
    pool.wait()
    # url = "http://www.baidu.com"
    # proxy = randproxy()
    # proxy_host = "http://" + proxy['ip'] + ":" + proxy['port']
    # proxy_temp = {"http": proxy_host}
    # test = requests.get(url,headers=randHeader(),timeout=10,proxies=proxy_temp)
    # soup = BeautifulSoup(test.text,'lxml')
    # print(soup)
I can't post an image, so I can only paste the error message:
File "C:\Python\lib\site-packages\threadpool.py", line 158, in run
result = request.callable(*request.args, **request.kwds)
File "C:/qichezhijia/proxyspider.py", line 80, in download
parse_url = requests.get(url[0],headers=randHeader(),timeout=12,proxies=proxy_temp)
AttributeError: 'list' object has no attribute 'get'
(The same traceback is printed over and over, once for each failed work request.)
What does makeRequests do? Have you assigned a list to the name requests? If so, requests.get(...) will naturally go wrong.
makeRequests is similar to Python's map function: it takes two parameters (a function and a list), and each entry in the list is supplied to that function as a separate work request...
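Roughly, the pattern looks like this (a minimal sketch with placeholder work items; it uses the threadpool package's makeRequests/putRequest API):

from threadpool import ThreadPool, makeRequests

def work(item):
    print("processing", item)

pool = ThreadPool(2)
# one WorkRequest per list entry, like map(work, [1, 2, 3]),
# but nothing runs until the requests are queued on a pool
reqs = makeRequests(work, [1, 2, 3])
for req in reqs:
    pool.putRequest(req)   # queue each request for the worker threads
pool.wait()                # block until all queued requests finish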
The requests.get inside download is meant to come from the requests module, but perhaps another variable has the same name. Or maybe url[0] is written wrong; go back and debug it when you get a chance...
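That name clash can be reproduced in two lines (a minimal sketch; the URL is a placeholder):

import requests

requests = ["no", "longer", "the", "module"]  # rebinds the global name to a plain list
requests.get("http://example.com")  # AttributeError: 'list' object has no attribute 'get'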
The name is the same: the requests list created in __main__ shadows the requests module that download uses. I suggest changing these lines

requests = makeRequests(download,proxy_url_list())
[pool.putRequest(req) for req in requests]

to

myrequests = makeRequests(download,proxy_url_list())
[pool.putRequest(req) for req in myrequests]

and trying again.
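For completeness, a sketch of the corrected main block. It assumes that makeRequests, given a plain list of URLs, passes each URL to download as a single positional argument, in which case url is already the string and the url[0] the earlier comment questioned should likely become url inside download as well:

if __name__ == "__main__":
    pool = ThreadPool(2)
    # any name other than "requests" avoids shadowing the requests module
    myrequests = makeRequests(download, proxy_url_list())
    for req in myrequests:
        pool.putRequest(req)   # queue one work request per page URL
    pool.wait()                # block until every page has been processed

The fetch in download would then read requests.get(url, headers=randHeader(), timeout=12, proxies=proxy_temp).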