python-tutorials/tests/DoCcrawler.py

78 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
import json
import stomp
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Settings for the query: message-broker connection details and crawl parameters.
AMQHOST = "mq.ossez.com"
AMQPORT = 61616
AMQUSER = "artemis"
AMQPASS = "artemis"
TOPICNAME = "policyQueue"
# Prompt on the console for the earliest year; the answer is kept as the raw string.
# NOTE(review): begin_year is not referenced anywhere in the visible code.
begin_year = input("请输入最早年份:")
# Derive the current calendar year from the local clock.
current_timestamp = time.time()
time_tuple = time.localtime(current_timestamp)
end_year = time_tuple.tm_year
wait_time = 0.5 # wait time (presumably seconds - TODO confirm); not referenced in the visible code
action_pixel = 100 # mouse scroll distance in pixels; not referenced in the visible code
# Launch Chrome through a locally installed chromedriver and open the website.
chrome = Chrome(service=Service(r"C:\Users\yhu\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"))
chrome.get('https://www.isharkfly.com/')
# Explicit-wait helper bound to this browser with a 10 second timeout.
wait = WebDriverWait(chrome, 10)
# Table rows matched by an absolute XPath at this moment (no wait is applied here).
rowContent = chrome.find_elements(By.XPATH, '/html/body/div[3]/div/div/div/div[4]/div/table/tbody/tr')
# Handle of the window that shows the listing, used later to tell it apart
# from windows opened by clicking a row and to switch back to it.
main_window = chrome.current_window_handle
def send_mq(ctxInfo, ctx):
    """Publish one policy document to the message broker as a JSON message.

    The summary text is scanned line by line for the labels ``索引号``
    (index number) and ``名称`` (title). Their values, together with the
    document body, are serialized to JSON and sent to ``TOPICNAME`` over a
    STOMP connection that is opened and closed for this single message.

    Args:
        ctxInfo: Multi-line summary text of the document, one
            ``label + separator + value`` entry per line.
        ctx: Full body text of the document.

    Raises:
        ValueError: If ``ctxInfo`` has no line for the index number or for
            the title.
    """
    # Characters that may sit between a label and its value: full-width
    # colon, ASCII colon, ideographic space, no-break space, space, tab.
    # NOTE(review): the original code split on an empty string literal, which
    # always raises "ValueError: empty separator"; the intended separator
    # character appears to have been lost - confirm against the live page.
    separators = '\uff1a:\u3000\xa0 \t'
    labels = ('索引号', '名称')

    # The publication-date line ('发文日期') is deliberately not collected:
    # the message payload has no field for it.
    fields = {}
    for summary in ctxInfo.splitlines():
        for label in labels:
            if summary.startswith(label):
                # Everything after the label, minus the separator, is the
                # value. A later line with the same label overrides an
                # earlier one.
                fields[label] = summary[len(label):].lstrip(separators).strip()
                break

    missing = [label for label in labels if label not in fields]
    if missing:
        raise ValueError(
            "summary text has no line starting with: " + ", ".join(missing)
        )

    data = {
        'policy_index_number': fields['索引号'],
        'policy_title': fields['名称'],
        'policy_content': ctx,
    }

    # auto_content_length=False makes the client omit the content-length
    # header on the frames it sends.
    conn = stomp.Connection(
        host_and_ports=[(AMQHOST, AMQPORT)],
        auto_content_length=False,
    )
    conn.connect(username=AMQUSER, passcode=AMQPASS, wait=True)
    try:
        conn.send(body=json.dumps(data), destination=TOPICNAME)
        # Short pause before disconnecting, presumably so the frame is
        # flushed to the broker first - TODO confirm it is still needed.
        time.sleep(1)
    finally:
        # Release the broker connection even when sending fails.
        conn.disconnect()
# Walk every listing row: open its document in the pop-up window, scrape the
# summary and body text, publish them, then return to the listing window.
try:
    for tr in rowContent:
        tdList = tr.find_elements(By.XPATH, 'td')
        if len(tdList) < 3:
            # Rows without the three expected cells (e.g. a header row made
            # of <th> cells) carry no document; indexing them would raise
            # IndexError.
            continue
        # indexNumber and docDate are read from the row but not used further
        # in the visible code.
        indexNumber = tdList[0].text
        docName = tdList[1].text
        docDate = tdList[2].text
        tdList[1].find_element(By.TAG_NAME, 'a').click()
        # The link is expected to open the document in a second window.
        wait.until(EC.number_of_windows_to_be(2))
        # Switch to the first window handle that is not the listing window.
        for window_handle in chrome.window_handles:
            if window_handle != main_window:
                chrome.switch_to.window(window_handle)
                break
        try:
            # Wait for the summary list to be present before reading it;
            # the new window may still be loading right after the switch.
            ctxInfo = wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[3]/div/div/div/div[2]/div/div[1]/ul')
            )).text
            ctx = chrome.find_element(
                By.XPATH, '/html/body/div[3]/div/div/div/div[2]/div/div[2]'
            ).text
            send_mq(ctxInfo, ctx)
        finally:
            # Always close the document window and return to the listing,
            # so a failure on one document does not leave a stray window.
            chrome.close()
            chrome.switch_to.window(main_window)
        print(docName)
finally:
    # Shut the browser and driver process down even if the crawl aborts.
    chrome.quit()