python-tutorials/tests/DoCcrawler.py

78 lines
2.5 KiB
Python
Raw Permalink Normal View History

2023-09-20 00:22:26 -04:00
import time
import json
import stomp
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# 待查询信息
AMQHOST = "mq.ossez.com"
AMQPORT = 61616
AMQUSER = "artemis"
AMQPASS = "artemis"
TOPICNAME = "policyQueue"
begin_year = input("请输入最早年份:")
current_timestamp = time.time()
time_tuple = time.localtime(current_timestamp)
end_year = time_tuple.tm_year
wait_time = 0.5 # 等待时间
action_pixel = 100 # 鼠标滚动像素
# get网站
chrome = Chrome(service=Service(r"C:\Users\yhu\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"))
chrome.get('https://www.isharkfly.com/')
wait = WebDriverWait(chrome, 10)
rowContent = chrome.find_elements(By.XPATH, '/html/body/div[3]/div/div/div/div[4]/div/table/tbody/tr')
main_window = chrome.current_window_handle
def send_mq(ctxInfo, ctx):
for summary in ctxInfo.splitlines():
if summary.startswith('索引号'):
index_string = summary.split('')[1]
elif summary.startswith('发文日期'):
policyDateTime = summary.split('')[1]
elif summary.startswith('名称'):
policy_name = summary.split('')[1]
data = {}
data['policy_index_number'] = index_string
data['policy_title'] = policy_name
data['policy_content'] = ctx
hosts = [(AMQHOST, AMQPORT)]
conn = stomp.Connection(host_and_ports=hosts, auto_content_length=False)
conn.connect(username=AMQUSER, passcode=AMQPASS, wait=True)
conn.send(body=json.dumps(data), destination=TOPICNAME)
time.sleep(1)
conn.disconnect()
for tr in rowContent:
tdList = tr.find_elements("xpath", 'td')
indexNumber = tdList[0].text
docName = tdList[1].text
tdList[1].find_element(By.TAG_NAME, 'a').click()
docDate = tdList[2].text
wait.until(EC.number_of_windows_to_be(2))
# Loop through until we find a new window handle
for window_handle in chrome.window_handles:
if window_handle != main_window:
chrome.switch_to.window(window_handle)
break
ctxInfo = chrome.find_element(By.XPATH, '/html/body/div[3]/div/div/div/div[2]/div/div[1]/ul').text
ctx = chrome.find_element(By.XPATH, '/html/body/div[3]/div/div/div/div[2]/div/div[2]').text
send_mq(ctxInfo, ctx)
chrome.close()
chrome.switch_to.window(main_window)
print(docName)
# break
chrome.quit()