$ tree -A
simple_server/index.html
<!DOCTYPE html>
<html lang="en">
  <head><title>Cool Owls!</title></head>
  <body>
    <h1>Welcome to my owl gallery</h1>
    <div>
      <img src="img/owl-alcohol.png" height="128" />
      <img src="img/owl-book.png" height="128" />
      <img src="img/owl-books.png" height="128" />
      <img src="img/owl-ebook.jpg" height="128" />
      <img src="img/owl-rose.jpeg" height="128" />
    </div>
    <p>Do you like my owls?</p>
  </body>
</html>
$ python -m http.server 8000
$ ./serve.sh
First approach – scripting
The imports
scrape.py (Imports)
import argparse
import base64
import json
import os
from bs4 import BeautifulSoup
import requests
$ pip freeze | egrep -i "soup|requests"
$ pip install beautifulsoup4 requests
Parsing arguments
scrape.py (Argument parsing and scraper triggering)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Scrape a webpage.')
    parser.add_argument(
        '-t', '--type', choices=['all', 'png', 'jpg'],
        default='all', help='The image type we want to scrape.')
    parser.add_argument(
        '-f', '--format', choices=['img', 'json'], default='img',
        help='The format images are saved to.')
    parser.add_argument(
        'url', help='The URL we want to scrape for images.')
    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
$ python scrape.py -h
$ python scrape.py http://localhost:8000
$ python scrape.py -t png http://localhost:8000
$ python scrape.py --type=jpg -f json http://localhost:8000
The business logic
scrape.py (Business logic)
def scrape(url, format_, type_):
    try:
        page = requests.get(url)
    except requests.RequestException as rex:
        print(str(rex))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = _fetch_images(soup, url)
        images = _filter_images(images, type_)
        _save(images, format_)

def _fetch_images(soup, base_url):
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        img_url = (
            '{base_url}/{src}'.format(base_url=base_url, src=src))
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images

def _filter_images(images, type_):
    if type_ == 'all':
        return images
    ext_map = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }
    return [
        img for img in images
        if _matches_extension(img['name'], ext_map[type_])
    ]

def _matches_extension(filename, extension_list):
    name, extension = os.path.splitext(filename.lower())
    return extension in extension_list

def _save(images, format_):
    if images:
        if format_ == 'img':
            _save_images(images)
        else:
            _save_json(images)
        print('Done')
    else:
        print('No images to save.')

def _save_images(images):
    for img in images:
        img_data = requests.get(img['url']).content
        with open(img['name'], 'wb') as f:
            f.write(img_data)

def _save_json(images):
    data = {}
    for img in images:
        img_data = requests.get(img['url']).content
        b64_img_data = base64.b64encode(img_data)
        str_img_data = b64_img_data.decode('utf-8')
        data[img['name']] = str_img_data

    with open('images.json', 'w') as ijson:
        ijson.write(json.dumps(data))
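As a quick sanity check (my own addition, not part of the original listing), the filtering helpers can be exercised on their own in a REPL; the sample names come from the owl gallery page above:
>>> images = [{'name': 'owl-book.png', 'url': 'http://localhost:8000/img/owl-book.png'},
...           {'name': 'owl-rose.jpeg', 'url': 'http://localhost:8000/img/owl-rose.jpeg'}]
>>> _filter_images(images, 'jpg')
[{'name': 'owl-rose.jpeg', 'url': 'http://localhost:8000/img/owl-rose.jpeg'}]
>>> _matches_extension('OWL-EBOOK.JPG', ['.jpg', '.jpeg'])
True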
images.json (truncated)
{ "owl-ebook.jpg": "/9j/4AAQSkZJRgABAQEAMQAxAAD/2wBDAAEBAQ...
"owl-book.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEbCAYAAAB...
"owl-books.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAElCAYAAA...
"owl-alcohol.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEICAYA...
"owl-rose.jpeg": "/9j/4AAQSkZJRgABAQEANAA0AAD/2wBDAAEBAQ...
}
Second approach – a GUI application
$ python -m tkinter
The imports
from tkinter import *
from tkinter import ttk, filedialog, messagebox
import base64
import json
import os
from bs4 import BeautifulSoup
import requests
The layout logic
guiscrape.py
if __name__ == "__main__":
    _root = Tk()
    _root.title('Scrape app')

    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    _url = StringVar()
    _url.set('http://localhost:8000')
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)
    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))
    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    _root.mainloop()
The business logic
Fetching the web page
config = {}

def fetch_url():
    url = _url.get()
    config['images'] = []
    _images.set(())  # initialized as an empty tuple
    try:
        page = requests.get(url)
    except requests.RequestException as rex:
        _sb(str(rex))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        if images:
            _images.set(tuple(img['name'] for img in images))
            _sb('Images found: {}'.format(len(images)))
        else:
            _sb('No images found')
        config['images'] = images

def fetch_images(soup, base_url):
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        img_url = (
            '{base_url}/{src}'.format(base_url=base_url, src=src))
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
Saving the images
def save():
    if not config.get('images'):
        _alert('No images to save')
        return

    if _save_method.get() == 'img':
        dirname = filedialog.askdirectory(mustexist=True)
        _save_images(dirname)
    else:
        filename = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        _save_json(filename)

def _save_images(dirname):
    if dirname and config.get('images'):
        for img in config['images']:
            img_data = requests.get(img['url']).content
            filename = os.path.join(dirname, img['name'])
            with open(filename, 'wb') as f:
                f.write(img_data)
        _alert('Done')

def _save_json(filename):
    if filename and config.get('images'):
        data = {}
        for img in config['images']:
            img_data = requests.get(img['url']).content
            b64_img_data = base64.b64encode(img_data)
            str_img_data = b64_img_data.decode('utf-8')
            data[img['name']] = str_img_data

        with open(filename, 'w') as ijson:
            ijson.write(json.dumps(data))
        _alert('Done')
Alerting the user
def _sb(msg):
    _status_msg.set(msg)

def _alert(msg):
    messagebox.showinfo(message=msg)
How to improve the application?
with open('images.json', 'r') as f:
    data = json.loads(f.read())

for (name, b64val) in data.items():
    with open(name, 'wb') as f:
        f.write(base64.b64decode(b64val))
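One concrete improvement worth mentioning (my suggestion, not something shown above): both versions of the fetch-images function build image URLs with plain string formatting ('{base_url}/{src}'), which only works while every src is a simple relative path. The standard library's urllib.parse.urljoin resolves src against the page URL following normal URL rules instead; a minimal sketch of the change:
from urllib.parse import urljoin

def _fetch_images(soup, base_url):
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        # urljoin resolves src against the page URL, coping with
        # absolute src values as well as relative paths.
        img_url = urljoin(base_url, src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images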
Where do we go from here?
The tkinter.tix module
The turtle module
wxPython, PyQt, and PyGTK
The principle of least astonishment
Threading considerations
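As written above, fetch_url performs the HTTP request on the Tk main loop, so the window freezes until requests.get returns. A common pattern (sketched here as an assumption, not code from the original application) is to push the blocking call onto a worker thread and hand the result back to the GUI with after():
import threading

def fetch_url_async():
    url = _url.get()
    _sb('Fetching {} ...'.format(url))
    # Run the blocking network call in a daemon thread so the GUI stays responsive.
    threading.Thread(target=_fetch_worker, args=(url,), daemon=True).start()

def _fetch_worker(url):
    try:
        page = requests.get(url)
    except requests.RequestException as rex:
        result = str(rex)
    else:
        result = 'Fetched {} bytes'.format(len(page.content))
    # Tkinter widgets must only be touched from the main thread,
    # so schedule the status update through the Tk event loop.
    _root.after(0, _sb, result)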
Summary