TD 4 - Exercice 1

On commence par modifier l'exemple URLLister du cours pour récupérer les liens vers les images.

import time, urllib, os, sys
from html.parser import HTMLParser
from urllib.parse import *


fmt='%Y-%m-%d_%H-%M-%S' # Date formatting pattern (strftime syntax)

class ImgLister(HTMLParser):
    """HTML parser that records the ``src`` attribute of every ``<img>`` tag.

    After a document has been fed to the parser, ``imgs`` holds the collected
    ``src`` values exactly as they were written in the markup.
    """

    def reset(self):
        """Empty the list of collected sources and reset the parser state."""
        self.imgs = []
        super().reset()

    def _collect_src(self, tag, attrs):
        """Append each ``src`` value of an ``img`` tag to ``self.imgs``."""
        if tag != 'img':
            return
        for key, value in attrs:
            if key == 'src':
                self.imgs.append(value)

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag such as ``<img ...>``."""
        self._collect_src(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<img ... />``."""
        self._collect_src(tag, attrs)

# `import urllib` alone does not load the `request` submodule.
import urllib.request

url = 'http://igm.univ-mlv.fr/~jyt'

with urllib.request.urlopen(url) as f:
    s = f.read() # these are bytes; they must be decoded before parsing
    # Charset declared in the Content-Type header; None when the server
    # does not declare one.
    encoding = f.headers.get_content_charset()

# Try the declared charset first (if any), then fall back to UTF-8 and
# finally latin1, which accepts any byte sequence and so always succeeds.
for e in ([encoding] if encoding else []) + ['utf8', 'latin1']:
    try:
        s = s.decode(e)
        break
    except (UnicodeDecodeError, LookupError):
        # Wrong or unknown codec: try the next candidate.
        continue

s[:100] # It works ...

'<HTML>\n<HEAD>\n<link rel="icon" href="favicon.ico" type="image/x-icon">\n<link rel="shortcut icon" hre'

# Feed the decoded page `s` to the parser, then close it to flush any
# buffered input.
p = ImgLister()
p.feed(s)
p.close()

p.imgs  # notebook cell: display the collected src values

['/images/site/logoigm.gif',
 '/images/site/titre-petit.jpg',
 '/images/site/barre.gif',
 '/images/site/logoCNRS.jpg',
 'thibon.gif',
 'mail.gif',
 'iuf.gif',
 'http://scholar.google.com/scholar/scholar_sm.gif']

def list_images(url):
    """Return the absolute URLs of the images referenced by the page at *url*.

    The page is downloaded, decoded to text, and parsed with ``ImgLister``;
    every non-empty ``src`` value is then resolved against *url*.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of absolute image URLs, in the order the parser collected them.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # `import urllib` alone does not load the `request` submodule.
    import urllib.request

    with urllib.request.urlopen(url) as f:
        raw = f.read()
        # Charset declared in the Content-Type header, or None.
        declared = f.headers.get_content_charset()

    # Try the declared charset first, then UTF-8, then latin1. latin1 accepts
    # any byte sequence, so `text` is always bound when the loop ends.
    for codec in ([declared] if declared else []) + ['utf8', 'latin1']:
        try:
            text = raw.decode(codec)
            break
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown codec: try the next candidate.
            continue

    p = ImgLister()
    p.feed(text)
    p.close()
    # Complete relative URLs. Empty or valueless src attributes are skipped:
    # urljoin would resolve them to the page URL itself.
    return [urljoin(url, i, allow_fragments=False) for i in p.imgs if i]

list_images(url)  # notebook cell: display the resolved image URLs for `url`

['http://igm.univ-mlv.fr/images/site/logoigm.gif',
 'http://igm.univ-mlv.fr/images/site/titre-petit.jpg',
 'http://igm.univ-mlv.fr/images/site/barre.gif',
 'http://igm.univ-mlv.fr/images/site/logoCNRS.jpg',
 'http://igm.univ-mlv.fr/thibon.gif',
 'http://igm.univ-mlv.fr/mail.gif',
 'http://igm.univ-mlv.fr/iuf.gif',
 'http://scholar.google.com/scholar/scholar_sm.gif']

def download_images(url):
    """Download every image referenced by the page at *url*.

    Each image is written to the current working directory under a name made
    of the current timestamp (formatted with ``fmt``), an underscore, and the
    last path component of the image URL.

    Args:
        url: Address of the HTML page whose images are downloaded.

    Raises:
        urllib.error.URLError: If the page or one of its images cannot be
            fetched.
        OSError: If an image file cannot be written.
    """
    # `import urllib` alone does not load the `request` submodule.
    import urllib.request

    for i in list_images(url):
        name = time.strftime(fmt) + '_' + os.path.basename(urlsplit(i).path)
        # Context managers close the HTTP response and the output file even
        # if reading or writing fails.
        with urllib.request.urlopen(i) as response:
            data = response.read()
        # urllib.request.urlretrieve could be used instead.
        with open(name, 'wb') as out:
            out.write(data)

Le script complet

#!/usr/bin/python3
"""
dnld-imgs: download images from url to the current directory

Usage:  dnld-imgs  <url>
"""
import time, urllib, os, sys
from html.parser import HTMLParser
from urllib.parse import *


fmt='%Y-%m-%d_%H-%M-%S'  # strftime pattern used to timestamp downloaded file names

class ImgLister(HTMLParser):
    """Collect the ``src`` attribute values of all ``<img>`` tags in a page.

    The values are accumulated in ``imgs`` as they appear in the markup; they
    are not resolved against any base URL.
    """

    def reset(self):
        """Start over with an empty ``imgs`` list and a fresh parser state."""
        self.imgs = []
        super().reset()

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` values of an opening ``<img ...>`` tag."""
        if tag != 'img':
            return
        self.imgs += [value for name, value in attrs if name == 'src']

    def handle_startendtag(self, tag, attrs):
        """Record the ``src`` values of a self-closing ``<img ... />`` tag."""
        if tag != 'img':
            return
        self.imgs += [value for name, value in attrs if name == 'src']

def list_images(url):
    """Return the absolute URLs of the images referenced by the page at *url*.

    The page is downloaded, decoded to text, and parsed with ``ImgLister``;
    every non-empty ``src`` value is then resolved against *url*.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of absolute image URLs, in the order the parser collected them.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # `import urllib` alone does not load the `request` submodule.
    import urllib.request

    with urllib.request.urlopen(url) as f:
        raw = f.read()
        # Charset declared in the Content-Type header, or None.
        declared = f.headers.get_content_charset()

    # Try the declared charset first, then UTF-8, then latin1. latin1 accepts
    # any byte sequence, so `text` is always bound when the loop ends.
    for codec in ([declared] if declared else []) + ['utf8', 'latin1']:
        try:
            text = raw.decode(codec)
            break
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown codec: try the next candidate.
            continue

    p = ImgLister()
    p.feed(text)
    p.close()
    # Complete relative URLs. Empty or valueless src attributes are skipped:
    # urljoin would resolve them to the page URL itself.
    return [urljoin(url, i, allow_fragments=False) for i in p.imgs if i]


def download_images(url):
    """Download every image referenced by the page at *url*.

    Each image is written to the current working directory under a name made
    of the current timestamp (formatted with ``fmt``), an underscore, and the
    last path component of the image URL.

    Args:
        url: Address of the HTML page whose images are downloaded.

    Raises:
        urllib.error.URLError: If the page or one of its images cannot be
            fetched.
        OSError: If an image file cannot be written.
    """
    # `import urllib` alone does not load the `request` submodule.
    import urllib.request

    for i in list_images(url):
        name = time.strftime(fmt) + '_' + os.path.basename(urlsplit(i).path)
        # Context managers close the HTTP response and the output file even
        # if reading or writing fails.
        with urllib.request.urlopen(i) as response:
            data = response.read()
        with open(name, 'wb') as out:
            out.write(data)


if __name__ == '__main__':
    # Exactly one argument (the page URL) is expected; otherwise show usage.
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit(1)

    url = sys.argv[1]
    try:
        download_images(url)
    except Exception as err:
        # Report the cause on stderr, then show the usage text. Catching
        # Exception (not a bare except) lets SystemExit and KeyboardInterrupt
        # propagate.
        print(err, file=sys.stderr)
        print(__doc__)
        sys.exit(1)
    # Kept outside the try block so the exit is never mistaken for a failure.
    sys.exit(0)