chap 
tree -A
simple_server/index.html
<!DOCTYPE html>
<html lang="en">
  <head>
    <title>Cool Owls!</title>
  </head>
  <body>
    <h1>Welcome to my owl gallery</h1>
    <!--
      Test gallery for the scraper: three PNG and two JPEG images, all
      referenced with paths relative to this page.
      Every <img> carries an alt attribute (required for valid HTML); the
      texts are derived from the file names.
    -->
    <div>
      <img src="img/owl-alcohol.png" height="128" alt="Owl alcohol" />
      <img src="img/owl-book.png" height="128" alt="Owl book" />
      <img src="img/owl-books.png" height="128" alt="Owl books" />
      <img src="img/owl-ebook.jpg" height="128" alt="Owl ebook" />
      <img src="img/owl-rose.jpeg" height="128" alt="Owl rose" />
    </div>
    <p>Do you like my owls?</p>
  </body>
</html>
$ python -m http.server 8000
$ ./serve.sh
First approach – scripting
The imports
scrape.py (Imports)
import argparse
import base64
import json
import os
from bs4 import BeautifulSoup
import requests
$ pip freeze | egrep -i "soup|requests"
$ pip install beautifulsoup4 requests
Parsing arguments
scrape.py (Argument parsing and scraper triggering)
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them and
    # hand the resulting values over to scrape().
    cli = argparse.ArgumentParser(description='Scrape a webpage.')

    # Optional: restrict the scrape to one image type.
    cli.add_argument(
        '-t', '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.',
    )

    # Optional: choose how the scraped images are persisted.
    cli.add_argument(
        '-f', '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.',
    )

    # Required positional argument: the page to scan.
    cli.add_argument('url', help='The URL we want to scrape for images.')

    options = cli.parse_args()
    scrape(options.url, options.format, options.type)
$ python scrape.py -h
$ python scrape.py http://localhost:8000
$ python scrape.py -t png http://localhost:8000
$ python scrape.py --type=jpg -f json http://localhost:8000
The business logic
scrape.py (Business logic)
def scrape(url, format_, type_, timeout=10):
    """Fetch the page at *url* and save the images it references.

    Args:
        url: Address of the web page to scan for ``<img>`` tags.
        format_: Passed to ``_save``; ``'img'`` stores one file per image,
            anything else stores a single JSON file.
        type_: Passed to ``_filter_images``; ``'all'``, ``'png'`` or
            ``'jpg'``.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the script forever.

    Request failures (connection errors, timeouts, HTTP error statuses)
    are printed rather than raised, and nothing is saved in that case.
    """
    try:
        page = requests.get(url, timeout=timeout)
        # Without this check a 404/500 error page would be parsed as if
        # it were the requested page. HTTPError is a RequestException, so
        # it is reported by the handler below.
        page.raise_for_status()
    except requests.RequestException as rex:
        print(str(rex))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = _fetch_images(soup, url)
        images = _filter_images(images, type_)
        _save(images, format_)
def _fetch_images(soup, base_url):    
images = []    
for img in soup.findAll('img'):        
src = img.get('src')        
img_url = (            '{base_url}/{src}'.format(                base_url=base_url, src=src))        
name = img_url.split('/')[-1]        
images.append(dict(name=name, url=img_url))    
return images
def _filter_images(images, type_):    
if type_ == 'all':        
return images   
ext_map = {        'png': ['.png'],        'jpg': ['.jpg', '.jpeg'],    }    
return [        img for img in images        
if _matches_extension(img['name'], ext_map[type_])    
]
def _matches_extension(filename, extension_list):    
name, extension = os.path.splitext(filename.lower())   
return extension in extension_list
def _save(images, format_):
    """Persist *images* in the requested format and report the outcome.

    ``'img'`` delegates to ``_save_images``; any other value delegates to
    ``_save_json``. Prints ``Done`` afterwards, or a notice when there is
    nothing to save.
    """
    if not images:
        print('No images to save.')
        return

    saver = _save_images if format_ == 'img' else _save_json
    saver(images)
    print('Done')
def _save_images(images):
    """Download every image and write it to a file named after it.

    Each entry's ``'url'`` is fetched and the response body is written in
    binary mode to the path given by the entry's ``'name'``.
    """
    for entry in images:
        response = requests.get(entry['url'])
        with open(entry['name'], 'wb') as out_file:
            out_file.write(response.content)
def _save_json(images):
    """Download every image and store them all in ``images.json``.

    The file holds one JSON object that maps each image name to the
    Base64 text of its downloaded bytes.
    """
    def encoded(entry):
        # JSON cannot carry raw bytes: Base64-encode, then turn the
        # resulting bytes into text.
        raw = requests.get(entry['url']).content
        return base64.b64encode(raw).decode('utf-8')

    payload = {entry['name']: encoded(entry) for entry in images}
    with open('images.json', 'w') as out_file:
        json.dump(payload, out_file)
images.json (truncated)
{  "owl-ebook.jpg": "/9j/4AAQSkZJRgABAQEAMQAxAAD/2wBDAAEBAQ...  
"owl-book.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEbCAYAAAB...  
"owl-books.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAElCAYAAA...  
"owl-alcohol.png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEICAYA...  
"owl-rose.jpeg": "/9j/4AAQSkZJRgABAQEANAA0AAD/2wBDAAEBAQ...
}
-- Second approach – a GUI application
$ python -m tkinter
The imports
# The wildcard import stays first so that the explicit imports below win
# if any name collides.
from tkinter import *
from tkinter import ttk, filedialog, messagebox
import base64
import json
import os
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup
import requests
The layout logic
guiscrape.py
if __name__ == "__main__":
    # ---- Root window ----------------------------------------------------
    _root = Tk()
    _root.title('Scrape app')

    # ---- Main container, holds everything except the status bar ---------
    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # ---- URL frame: text entry plus the "Fetch info" button -------------
    _url_frame = ttk.LabelFrame(_mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    # Variable bound to the entry; fetch_url() reads it.
    _url = StringVar(value='http://localhost:8000')
    _url_entry = ttk.Entry(_url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # ---- Content frame: image list, scrollbar and save-mode radios ------
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    # Variable backing the listbox rows; fetch_url() fills it.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)

    # Listbox and scrollbar are wired to each other in both directions.
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(_radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    # Variable shared by both radio buttons; save() reads it.
    _save_method = StringVar(value='img')

    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method, value='img')
    _img_only_radio.grid(row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')

    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method, value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    # ---- "Scrape!" button, triggers save() ------------------------------
    _scrape_btn = ttk.Button(_mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # ---- Status bar, a child of the root placed below the main frame ----
    _status_frame = ttk.Frame(_root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    # Variable shown by the status label; _sb() updates it.
    _status_msg = StringVar(value='Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    # Hand control to Tk; returns when the window is closed.
    _root.mainloop()
The business logic
Fetching the web page
# Shared state between the callbacks: config['images'] holds the images
# found by the most recent fetch.
config = {}


def fetch_url():
    """Fetch the page named in the URL entry and list the images it holds.

    Clears the previous results first. On success the image names are
    shown in the listbox, the count is written to the status bar and the
    images are stored in ``config['images']``. Request failures
    (connection errors, timeouts, HTTP error statuses) are reported in the
    status bar and leave ``config['images']`` empty.
    """
    url = _url.get()
    config['images'] = []
    _images.set(())   # initialized as an empty tuple
    try:
        # The timeout keeps an unresponsive server from blocking this
        # callback indefinitely.
        page = requests.get(url, timeout=10)
        # Without this check a 404/500 error page would be parsed as if
        # it were the requested page. HTTPError is a RequestException, so
        # it is reported by the handler below.
        page.raise_for_status()
    except requests.RequestException as rex:
        _sb(str(rex))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        if images:
            _images.set(tuple(img['name'] for img in images))
            _sb('Images found: {}'.format(len(images)))
        else:
            _sb('No images found')
        config['images'] = images


def fetch_images(soup, base_url):
    """Collect the name and absolute URL of every ``<img>`` in *soup*.

    Args:
        soup: Parsed document exposing ``find_all('img')``; each returned
            tag must support ``.get('src')``.
        base_url: URL of the page the document was loaded from; relative
            ``src`` values are resolved against it.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per image that
        has a non-empty ``src``. ``name`` is the last segment of the URL
        path, without any query string.
    """
    images = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without src would otherwise become ".../None".
            continue
        # urljoin resolves the reference the way a browser does: absolute
        # URLs are kept, "/rooted" paths replace the base path, and a
        # trailing slash on the base does not produce "//".
        img_url = urljoin(base_url, src)
        # Take the name from the path only, so "a.png?v=2" is listed and
        # saved as "a.png".
        name = urlsplit(img_url).path.rsplit('/', 1)[-1]
        images.append(dict(name=name, url=img_url))
    return images
Saving the images
def save():
    """Ask where to save, then store the fetched images.

    Shows an alert and stops when ``config`` holds no images. Otherwise
    the selected radio button decides the mode: ``'img'`` asks for an
    existing directory and calls ``_save_images``; any other value asks
    for a file name and calls ``_save_json``.
    """
    if not config.get('images'):
        _alert('No images to save')
        return

    if _save_method.get() != 'img':
        target_file = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        _save_json(target_file)
        return

    target_dir = filedialog.askdirectory(mustexist=True)
    _save_images(target_dir)


def _save_images(dirname):
    """Download every fetched image into the directory *dirname*.

    Does nothing when *dirname* is empty or ``config`` holds no images.
    Shows a ``Done`` alert once all files have been written.
    """
    if not dirname or not config.get('images'):
        return

    for entry in config['images']:
        payload = requests.get(entry['url']).content
        destination = os.path.join(dirname, entry['name'])
        with open(destination, 'wb') as out_file:
            out_file.write(payload)
    _alert('Done')
def _save_json(filename):
    """Download every fetched image and store them all in *filename*.

    The file holds one JSON object that maps each image name to the
    Base64 text of its downloaded bytes. Does nothing when *filename* is
    empty or ``config`` holds no images. Shows a ``Done`` alert once the
    file has been written.
    """
    if not filename or not config.get('images'):
        return

    def encoded(entry):
        # JSON cannot carry raw bytes: Base64-encode, then turn the
        # resulting bytes into text.
        raw = requests.get(entry['url']).content
        return base64.b64encode(raw).decode('utf-8')

    payload = {entry['name']: encoded(entry) for entry in config['images']}
    with open(filename, 'w') as out_file:
        json.dump(payload, out_file)
    _alert('Done')
Alerting the user
def _sb(msg):
    """Show *msg* in the status bar by setting the ``_status_msg`` variable."""
    _status_msg.set(msg)
def _alert(msg):
    """Show *msg* to the user in an informational message box."""
    messagebox.showinfo(message=msg)
How to improve the application?
# Rebuild the image files from images.json: every key is a file name and
# every value is the Base64 text of that file's bytes.
with open('images.json', 'r') as json_file:
    encoded_images = json.load(json_file)

for image_name, encoded in encoded_images.items():
    with open(image_name, 'wb') as image_file:
        image_file.write(base64.b64decode(encoded))
-- Where do we go from here?
The tkinter.tix module
The turtle module
wxPython, PyQt, and PyGTK
The principle of least astonishment
Threading considerations
-- Summary
 
No comments:
Post a Comment