Python 網頁爬蟲

開發流程:

1. 分析網站原始碼，找出圖片所使用的標籤與屬性
2. 爬取圖片的來源網址 (img 標籤的 src 屬性)
3. 將圖片下載到資料夾中

所需套件:
pip install requests
pip install beautifulsoup4
pip install lxml

匯入模組:
from bs4 import BeautifulSoup
import requests
import os

from bs4 import BeautifulSoup
import requests
import os

# Ask the user which keyword to search for on Unsplash.
name = input("input the download img：")

# Fetch the search-result page for that keyword. The URL must be built from
# `name` (the variable actually assigned above); the timeout keeps the script
# from hanging forever on a stalled connection.
html = requests.get(f"https://unsplash.com/s/photos/{name}", timeout=10)

# `html` is the requests Response object, so its body is read via `html.text`.
soup = BeautifulSoup(html.text, "lxml")

# Collect at most five <img> tags matching the given class string.
results = soup.find_all("img", {"class": "_2VWD4 _2zEKz"}, limit=5)

Selenium 實作


import datetime
import matplotlib.pyplot as plt
from selenium import webdriver
import urllib.request
from selenium.webdriver.support.ui import Select
# `By` locators work on both Selenium 3 and Selenium 4; the older
# find_element_by_* helpers were removed in later Selenium 4 releases.
from selenium.webdriver.common.by import By

# Three lines of input, in this order:
#   n     - suffix of the tab element id to click ('Tab' + n)
#   w     - suffix of the area radio-button label to click ('area' + w)
#   time_ - newest frame to show, formatted as "%Y/%m/%d %H:%M"
n = input()
w = input()
time_ = input()

# Parse the timestamp before starting the browser so that malformed input
# fails fast instead of leaving a freshly launched Chrome window behind.
format_time = datetime.datetime.strptime(time_, "%Y/%m/%d %H:%M")

driverPath = 'chromedriver.exe'
# NOTE(review): recent Selenium 4 releases dropped `executable_path` in favour
# of `service=Service(driverPath)`; confirm against the installed version.
b = webdriver.Chrome(executable_path=driverPath)

pic_src = []
try:
    b.get('https://www.cwb.gov.tw/V8/C/W/OBS_Sat.html')

    b.find_element(By.ID, 'Tab' + n).click()
    b.find_element(By.XPATH, "//label[@for='area" + w + "']").click()
    sel = Select(b.find_element(By.ID, 'selectday'))

    # Step back through six frames, 10 minutes apart, starting at the
    # requested time, and record the image URL shown for each one.
    for _ in range(6):
        sel.select_by_visible_text(format_time.strftime("%Y/%m/%d %H:%M"))
        format_time = format_time - datetime.timedelta(minutes=10)
        pic_src.append(
            b.find_element(By.XPATH, '//img[@alt="衛星雲圖"]').get_attribute('src')
        )
finally:
    # Always shut the browser down, even if a lookup above fails; otherwise
    # the Chrome/chromedriver processes are left running.
    b.quit()

print(pic_src)

# Show the six collected images in a 2x3 grid.
for idx, pic in enumerate(pic_src):
    plt.subplot(2, 3, idx + 1)
    # Close each HTTP response as soon as the image has been decoded.
    with urllib.request.urlopen(pic) as s:
        p = plt.imread(s, 'jpg')
    plt.imshow(p)
plt.show()
# Example value for time_: 2021/05/07 15:10