We start from the URLLister example seen in the course and modify it so that it collects the links to the images instead.
import time, os, sys
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin, urlsplit

fmt = '%Y-%m-%d_%H-%M-%S'  # date format used to timestamp the downloaded files
class ImgLister(HTMLParser):
    def reset(self):
        self.imgs = []
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        # ordinary start tags: <img src="...">
        if tag == 'img':
            src = [v for k, v in attrs if k == 'src']
            if src:
                self.imgs.extend(src)

    def handle_startendtag(self, tag, attrs):
        # self-closing (XHTML-style) tags: <img src="..."/>
        if tag == 'img':
            src = [v for k, v in attrs if k == 'src']
            if src:
                self.imgs.extend(src)
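Before pointing the parser at a real page, a quick sanity check on a literal snippet (a made-up fragment, just to confirm that both tag forms are caught):

p = ImgLister()
p.feed('<p><img src="a.png"> and <img src="b.png"/></p>')
p.close()
p.imgs   # ['a.png', 'b.png']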
url = 'http://igm.univ-mlv.fr/~jyt'
with urllib.request.urlopen(url) as f:
    s = f.read()                        # these are bytes: we must decode before parsing
    encoding = f.headers.get_charset()  # on this example, it is None
    if encoding:
        s = s.decode(encoding)
    else:
        for e in ['utf8', 'latin1']:    # so we try a couple of likely encodings
            try:
                s = s.decode(e)
                break                   # stop at the first one that works
            except UnicodeDecodeError:
                pass
s[:100]  # it works ...
p = ImgLister()
p.feed(s)
p.close()
p.imgs
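In passing: get_charset() inspects the message object itself and is usually None for HTTP responses. A more direct way to read the declared encoding (a sketch, not required by the course, reusing the same url) is get_content_charset(), which looks at the charset parameter of the Content-Type header:

with urllib.request.urlopen(url) as f:
    raw = f.read()
    enc = f.headers.get_content_charset() or 'latin1'  # fall back if no charset is declared
s = raw.decode(enc, errors='replace')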
def list_images(url):
    with urllib.request.urlopen(url) as f:
        s = f.read()
        e = f.headers.get_param('charset')
        if e:
            s = s.decode(e)
        else:
            for e in ['utf8', 'latin1']:
                try:
                    s = s.decode(e)
                    break
                except UnicodeDecodeError:
                    pass
    p = ImgLister()
    p.feed(s)
    p.close()
    return [urljoin(url, i, False) for i in p.imgs]  # resolve the URLs against the page URL
list_images(url)
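The comment on the return line deserves a word: urljoin resolves each src attribute against the page URL, so relative links become absolute while already-absolute ones are left untouched. A small illustration on made-up paths:

urljoin('http://igm.univ-mlv.fr/~jyt/', 'img/photo.png')             # 'http://igm.univ-mlv.fr/~jyt/img/photo.png'
urljoin('http://igm.univ-mlv.fr/~jyt/', '/commun/logo.png')          # 'http://igm.univ-mlv.fr/commun/logo.png'
urljoin('http://igm.univ-mlv.fr/~jyt/', 'http://example.org/a.png')  # unchanged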
def download_images(url):
    imgs = list_images(url)
    for i in imgs:
        # prefix the file name with a timestamp to avoid collisions
        name = time.strftime(fmt) + '_' + os.path.split(urlsplit(i).path)[-1]
        data = urllib.request.urlopen(i).read()
        with open(name, 'wb') as out:
            out.write(data)  # urllib.request.urlretrieve would also do the job
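As the last comment suggests, urllib.request.urlretrieve fetches a URL and writes it to a file in a single call. A possible variant of the loop (same naming scheme; the name download_images_retrieve is just for illustration):

def download_images_retrieve(url):
    for i in list_images(url):
        name = time.strftime(fmt) + '_' + os.path.split(urlsplit(i).path)[-1]
        urllib.request.urlretrieve(i, name)  # downloads i and saves it under name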
Finally, everything is packaged as a standalone script:

#!/usr/bin/python3
"""
dnld-imgs: download images from url to the current directory
Usage: dnld-imgs <url>
"""
import time, os, sys
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin, urlsplit

fmt = '%Y-%m-%d_%H-%M-%S'
class ImgLister(HTMLParser):
    def reset(self):
        self.imgs = []
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            src = [v for k, v in attrs if k == 'src']
            if src:
                self.imgs.extend(src)

    def handle_startendtag(self, tag, attrs):
        if tag == 'img':
            src = [v for k, v in attrs if k == 'src']
            if src:
                self.imgs.extend(src)
def list_images(url):
    with urllib.request.urlopen(url) as f:
        s = f.read()
        e = f.headers.get_param('charset')
        if e:
            s = s.decode(e)
        else:
            for e in ['utf8', 'latin1']:
                try:
                    s = s.decode(e)
                    break
                except UnicodeDecodeError:
                    pass
    p = ImgLister()
    p.feed(s)
    p.close()
    return [urljoin(url, i, False) for i in p.imgs]
def download_images(url):
    imgs = list_images(url)
    for i in imgs:
        name = time.strftime(fmt) + '_' + os.path.split(urlsplit(i).path)[-1]
        data = urllib.request.urlopen(i).read()
        with open(name, 'wb') as out:
            out.write(data)
if __name__ == '__main__':
    try:
        url = sys.argv[1]       # a missing argument also triggers the usage message
        download_images(url)
        sys.exit(0)
    except Exception:
        print(__doc__)          # print() is a function in Python 3
        sys.exit(1)
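Saved as dnld-imgs and made executable, the script is invoked as ./dnld-imgs <url> (for instance on the page used above): the images are written to the current directory with a timestamp prefix, and any failure prints the usage message and returns a non-zero exit code.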