-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path721pc.py
More file actions
105 lines (93 loc) · 3.05 KB
/
721pc.py
File metadata and controls
105 lines (93 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# from urllib.request import urlopen
import re
import csv
# import random
def get_html(url):
    """Fetch *url* with headless Chrome and return the fully rendered page source."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # `chrome_options=` was deprecated in Selenium 3 and removed in Selenium 4;
    # the supported keyword is `options=`.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # quit() shuts down the browser AND the chromedriver process;
        # close() only closes the window and can leak driver processes
        # when called once per page in a long crawl.
        driver.quit()
    return html
def get_menu(html):
    """Parse the category menu and return [label, data-src] pairs.

    Only anchors whose ``data-src`` starts with the literal
    ``/goodsbycategory?`` are kept.
    """
    # Name the parser explicitly: bare BeautifulSoup(html) emits a warning
    # and may pick a different parser depending on what is installed.
    soup = BeautifulSoup(html, 'html.parser')
    # BUG FIX: the original pattern "^/goodsbycategory?" left `?` unescaped,
    # so the regex made the trailing "y" optional instead of requiring a
    # literal "?" that starts the query string.
    pattern = re.compile(r"^/goodsbycategory\?")
    anchors = soup.find_all('a', {'data-src': pattern})
    return [[a.string, a.attrs['data-src']] for a in anchors]
def menu_spider(html):
    """Extract [title, desc, price] for every product card on a listing page."""
    # Explicit parser for reproducible parsing across machines.
    soup = BeautifulSoup(html, 'html.parser')
    list_info = soup.find_all('div', {'class': re.compile("^pro-item m-tag-a ")})
    list_good = []
    for li in list_info:
        list_title = li.find('p', {'class': 'pro-info'}).string.strip()
        list_desc = li.find('p', {'class': 'pro-desc'}).string.strip()
        # stripped_strings yields the whitespace-trimmed text fragments of the
        # price element; join them directly instead of copying into a temp list.
        list_price = " ".join(li.find('p', {'class': 'pro-price'}).stripped_strings)
        list_good.append([list_title, list_desc, list_price])
    return list_good
def good_spider(html):
    """Scrape one product detail page into [title, desc, price].

    Returns an empty list when the page is not a valid product page
    (i.e. it lacks the sku-container or summary blocks).
    """
    # Explicit parser for reproducible parsing across machines.
    soup = BeautifulSoup(html, 'html.parser')
    list_good = []
    # `is not None` (identity) instead of `!= None` for the sentinel check.
    if (soup.find('div', {'class': 'sku-container'}) is not None
            and soup.find('div', {'class': 'summary'}) is not None):
        list_good.append(soup.find('div', {'class': 'good-name fl'}).string.strip())
        list_good.append(soup.find('div', {'class': 'summary'}).string.strip())
        list_good.append(soup.find('span', {'class': 'value'}).string.strip())
    return list_good
def do_csv(list_goods):
    """Append scraped [name, desc, price] rows to data2.csv.

    The header row is written only when the file is new/empty; the original
    rewrote the header on every call, duplicating it in append mode.
    """
    with open('data2.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # In append mode the initial file position is the end of the file,
        # so position 0 means the file is empty and needs the header.
        if csvfile.tell() == 0:
            writer.writerow(['name', 'desc', 'price'])
        writer.writerows(list_goods)
# Crawl Mi YouPin product detail pages by numeric goods id and collect
# [name, desc, price] rows. NOTE(review): the upper bound 2826 looks like a
# snapshot of the catalogue size at crawl time — confirm before reuse.
basic_url = 'https://youpin.mi.com'
item = '/detail?gid=%s'
list_goods = []
# `for`/`range` replaces the manual while-counter, and `gid` avoids
# shadowing the builtin `id` (ids 1..2825, same as the original loop).
for gid in range(1, 2826):
    url = basic_url + item % gid
    html = get_html(url)
    list_good = good_spider(html)
    if list_good:  # skip pages that are not valid product pages
        list_goods.append(list_good)
        print(list_good)
    print(gid)
# do_csv(list_goods)