Python 網頁爬蟲

開發流程:

  • 分析網站原始碼,取得圖片標籤或屬性
  • 爬取圖片所在網址 (來源網址)
  • 下載圖片到資料夾裡面

所需套件:
pip install requests
pip install beautifulsoup4
pip install lxml

匯入模組:
from bs4 import BeautifulSoup
import requests
import os

範例程式碼:
from bs4 import BeautifulSoup
import requests
import os

# Ask for the search keyword; it becomes the last path segment of the
# Unsplash search URL requested below.
name = input("input the download img:")

# Fetch the search-result page for that keyword and parse it with lxml.
# The keyword lives in `name` and the response in `html`, so those are the
# names used here. A timeout keeps the script from hanging forever if the
# server never answers.
html = requests.get(f"https://unsplash.com/s/photos/{name}", timeout=10)
soup = BeautifulSoup(html.text, "lxml")

# Keep at most five <img> tags matched by the class filter below.
# NOTE(review): these class names look auto-generated and may no longer
# match the live page - confirm against the current markup.
results = soup.find_all("img", {"class": "_2VWD4 _2zEKz"}, limit=5)

Selenium 實作

完整程式碼:

import datetime
import matplotlib.pyplot as plt
from selenium import webdriver
import urllib.request
from selenium.webdriver.support.ui import Select
# Three values are read from stdin, one per line, in this order:
#   n     - suffix of the element id "Tab<n>" that gets clicked
#   w     - suffix of the label target "area<w>" that gets clicked
#   time_ - first timestamp to select, formatted "%Y/%m/%d %H:%M",
#           e.g. 2021/05/07 15:10
n = input()
w = input()
time_ = input()

# NOTE(review): `executable_path=` and `find_element_by_*` are the
# Selenium 3 spellings; newer Selenium 4 releases removed them - confirm the
# installed Selenium version before running.
driverPath = 'chromedriver.exe'
b = webdriver.Chrome(executable_path=driverPath)
b.get('https://www.cwb.gov.tw/V8/C/W/OBS_Sat.html')

# Click the requested tab and area label, then grab the <select> with id
# "selectday" that the loop below drives.
b.find_element_by_id('Tab' + n).click()
b.find_element_by_xpath("//label[@for='area"+w+"']").click()
sel = Select(b.find_element_by_id('selectday'))

format_time = datetime.datetime.strptime(time_, "%Y/%m/%d %H:%M")

# Select six timestamps, each 10 minutes earlier than the previous one, and
# record the src of the image with alt="衛星雲圖" after each selection.
pic_src = []
for _ in range(6):
    sel.select_by_visible_text(format_time.strftime("%Y/%m/%d %H:%M"))
    format_time = format_time - datetime.timedelta(minutes=10)
    pic_src.append(
        b.find_element_by_xpath('//img[@alt="衛星雲圖"]').get_attribute('src')
    )
print(pic_src)

# Download each recorded image and draw it into one cell of a 2x3 grid.
for idx, pic in enumerate(pic_src):
    plt.subplot(2, 3, idx + 1)
    # Close the HTTP response as soon as the image has been decoded.
    with urllib.request.urlopen(pic) as s:
        p = plt.imread(s, 'jpg')
    plt.imshow(p)
plt.show()