# Demo

import time
from io import BytesIO
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy import Selector # 使用Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from info.models import Info

# URL of the page that login() opens in the browser.
domin = 'https://www.tianyancha.com/group/3137567565/79e21454fdd5452eba4e818ce2e3894a'
# Credentials typed into the '#mobile' and '#password' fields by login().
# NOTE(review): these are hard-coded in source; consider loading them from
# the environment or a config file that is kept out of version control.
name = '13810000588'
password = 'wan97uan'

# Chrome start-up flags: run without extensions and without GPU acceleration.
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument('--disable-gpu')

# The browser is started as a module-level side effect (i.e. on import) and is
# shared by every function below.
browser = webdriver.Chrome(options=chrome_options)
# Empty list intended to hold all collected URLs.
# NOTE(review): nothing in the visible code appends to or reads this list.
url_list = list()
def login():
    """Open the target page and submit the password-login form.

    Drives the module-level ``browser``: loads ``domin``, opens the login
    dialog, switches to the password tab, types ``name`` / ``password``,
    ticks the agreement checkbox, submits the form and finally clicks the
    slider knob so that the captcha image is rendered.  The slider captcha
    itself is NOT solved here.

    Returns:
        None.

    Raises:
        Whatever Selenium raises when an expected element is missing
        (``find_element``) or does not appear within 10 seconds
        (``WebDriverWait.until``); nothing is caught except a failure to
        maximise the window.
    """
    try:
        # Maximise the window so element positions do not shift.
        browser.maximize_window()
    except Exception as exc:
        # Best effort only: a non-maximised window is still usable, so keep
        # going, but report the failure instead of hiding it.
        print("maximize_window failed, continuing:", exc)
    browser.get(domin)

    # Locate the login button, give the page time to settle, then click it.
    login_button = browser.find_element(By.CSS_SELECTOR, ".login")
    time.sleep(5)
    login_button.click()

    # Wait up to 10 s for '.toggle_box' to be present and click the element
    # the wait returns (no second lookup needed).
    wait = WebDriverWait(browser, 10)
    toggle_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.toggle_box'))
    )
    toggle_box.click()
    time.sleep(3)

    # Switch to the "password login" tab (matched by its visible text).
    password_tab = browser.find_element(By.XPATH, "//div[contains(text(), '密码登录')]")
    password_tab.click()
    time.sleep(2)

    username_ele = browser.find_element(By.CSS_SELECTOR, '#mobile')
    password_ele = browser.find_element(By.CSS_SELECTOR, '#password')
    username_ele.send_keys(name)
    password_ele.send_keys(password)

    # The site's agreement must be accepted before logging in.
    agree_button = browser.find_element(By.CSS_SELECTOR, "#agreement-checkbox-account")
    agree_button.click()
    time.sleep(1)

    # Clicking "log in" brings up the slider captcha.
    submit_button = browser.find_element(By.XPATH, "//div[@onclick='loginObj.loginByPhone(event);']")
    submit_button.click()
    time.sleep(2)

    # The captcha image is not rendered until the slider knob is clicked.
    slider_knob = browser.find_element(By.CSS_SELECTOR, ".gt_slider_knob")
    slider_knob.click()
    time.sleep(10)

def crop_image(image_file_name):
    """Cut the captcha background out of a full-page screenshot and save it.

    Args:
        image_file_name: Destination passed to the cropped image's ``save``.

    Returns:
        The cropped Pillow image.
    """
    time.sleep(2)
    captcha_element = browser.find_element(By.CSS_SELECTOR, '.gt_cut_bg.gt_show')
    origin = captcha_element.location
    # Offset of the image from the top-left corner of the page.
    print("图片的位置:", origin)
    dimensions = captcha_element.size

    # Derive the four edges of the bounding box from position and size
    # (location is a dict with "x"/"y", size a dict with "width"/"height").
    left = origin["x"]
    top = origin["y"]
    right = left + dimensions["width"]
    bottom = top + dimensions["height"]
    print("验证码截图坐标:", left, top, bottom, right)

    # Turn the PNG bytes returned by Selenium into a Pillow image.
    png_bytes = browser.get_screenshot_as_png()
    page_image = Image.open(BytesIO(png_bytes))

    # crop() expects a (left, upper, right, lower) tuple.
    box = (int(left), int(top), int(right), int(bottom))
    captcha = page_image.crop(box)
    captcha.save(image_file_name)
    return captcha

def _save_rows_from_current_page(page_number):
    """Parse the page currently shown in ``browser`` and save one Info per row found."""
    # Re-read page_source on every call: a Selector built before paging would
    # keep returning the rows of the first page.
    detail = Selector(text=browser.page_source)
    seq_cells = detail.xpath(
        "//*[@id='groupRepresentativeSection']/div/table/tbody/tr/td/text()"
    ).extract()
    if not seq_cells:
        print("no rows found on page", page_number)
        return

    # At most ten entries are read per page; slicing (instead of indexing
    # 0..9) keeps a shorter page from raising IndexError.
    for _ in seq_cells[:10]:
        info = Info()
        # NOTE(review): the stored values are still placeholders -- the page
        # number goes into seq/company_name and "空" into the rest.  The text
        # in seq_cells and the cells matched by
        # //*[@id='groupRepresentativeSection']/div/table/tbody/tr/td[@class='left-col']
        # are presumably the real sources; confirm against the page markup
        # before wiring them in.
        info.seq = page_number
        info.company_name = page_number
        info.legal_representative = "空"
        info.register_capital = '空'
        info.build_date = "空"
        info.state = "空"

        info.save(force_insert=True)


def get_info():
    """Walk the paginated table and save Info rows for every page.

    The page shown after login (page 1) is handled first; then the pager
    links for pages 2..199 are clicked one after another, re-parsing the
    page after each click.  Paging stops early when no link for the next
    page number exists.

    Returns:
        None.
    """
    # Take page 1 first, before any pager link is clicked.
    _save_rows_from_current_page(1)

    for page in range(2, 200):
        pager_links = browser.find_elements(
            By.XPATH, f"//a[@onclick='representMembersChange({page},this)']"
        )
        if not pager_links:
            # No link for this page number: stop instead of letting
            # find_element raise.
            break
        pager_links[0].click()

        # Give the browser time to run the JS that swaps in the new rows
        # before page_source is read.
        time.sleep(2)
        if page % 5 == 0:
            # Extra pause on every fifth page.
            time.sleep(3)

        _save_rows_from_current_page(page)

def get_all_page():
    """Collect data for all pages by delegating to ``get_info()``.

    Kept as a separate entry point so the paging logic lives in one place;
    no click is issued here before the first fetch.
    """
    get_info()


if __name__ == '__main__':
    login()
    # (Idea noted by the author: change the CSS so the captcha image is shown
    # without the movable block.)
    # The slider verification is done by hand at this point -- the script
    # cannot capture the captcha image -- so pause briefly.
    time.sleep(5)
    # Start collecting the data.
    get_info()