Wednesday, January 18, 2012

Naruto downloader from mangareader.net

After reading Naruto up to chapter 520 (courtesy of naruto with elisp), I was eager to read the rest. As of today, the latest chapter is 570. I found mangareader.net after a bit of googling and was busy reading my way through the chapters. But, like before, reading in the browser was not to my taste. Currently, mcomix is my comic reader of choice. So, I set about writing a script to automatically download the remaining chapters from mangareader.net. I do not know if it's wrong to do so; the site does not have any terms of use :|
import re
from urllib2 import urlopen
from zipfile import ZipFile, ZIP_DEFLATED
from xml.dom.minidom import parseString

def get_info(line, alt_regex):
    """Extract page metadata from the HTML line that holds the page image.

    Args:
        line: A string of HTML containing an ``<a href=...>`` element that
            wraps an ``<img>`` element. The anchor must be well-formed XML
            because it is parsed with ``xml.dom.minidom``.
        alt_regex: Pattern (string or compiled) applied to the image's
            ``alt`` text; group 1 must be the chapter number and group 2
            the page number.

    Returns:
        A dict with the keys ``'next'`` (the anchor's href), ``'img_url'``,
        ``'img_ext'`` (text after the last ``.`` of the image URL),
        ``'chapter'`` and ``'page'`` (both ints), or ``None`` when the line
        could not be parsed (the problem is printed).
    """
    try:
        # Cut the line down to the first <a ...>...</a> element so that it
        # can be parsed as a standalone XML document.
        line = line[:line.index('</a>')] + '</a>'
        line = line[line.index('<a href'):]
        dom = parseString(line)
        try:
            info = {}
            a = dom.getElementsByTagName('a')[0]
            info['next'] = a.getAttribute('href')
            img = a.getElementsByTagName('img')[0]
            info['img_url'] = img.getAttribute('src')
            info['img_ext'] = info['img_url'][info['img_url'].rindex('.') + 1:]
            alt = img.getAttribute('alt')
            m = re.search(alt_regex, alt)
            info['chapter'] = int(m.group(1))
            info['page'] = int(m.group(2))
            return info
        finally:
            # Release the DOM even when extraction fails part-way through.
            dom.unlink()
    except Exception as e:
        print('[ERROR] %s' % line)
        # Format the exception instead of concatenating it: str + exception
        # raises TypeError and would hide the real problem.
        print('[ERROR] %s' % e)
        return None

def get_image(url):
    """Download the resource at *url* and return its raw content.

    Args:
        url: The URL to fetch with ``urlopen``.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        download failed (the error is printed).
    """
    try:
        f = urlopen(url)
        try:
            return f.read()
        finally:
            # Close the response even if reading fails half-way.
            f.close()
    except Exception as e:
        # Format the exception instead of concatenating it: str + exception
        # raises TypeError and would hide the real problem.
        print('[ERROR] %s' % e)
        return None

def get_chapter(url_prefix, url_suffix, title, chapter=1):
    """Download pages starting at a URL and pack them into .cbz archives.

    Starting at ``url_prefix + url_suffix``, each page is fetched, the line
    containing ``id="img"`` is parsed with ``get_info``, the image is
    downloaded with ``get_image`` and stored in ``NNN.cbz`` (NNN being the
    zero-padded chapter number) in the current working directory. The
    page's "next" link is then followed. A new archive is started whenever
    the chapter number reported by a page changes. The download stops at
    the first page that has no image line or cannot be parsed.

    Args:
        url_prefix: Scheme and host, e.g. ``'http://www.mangareader.net'``.
        url_suffix: Path of the first page to download.
        title: Title as it appears in the image ``alt`` text; it is
            inserted verbatim into the regular expression
            ``'<title> (\\d+) - Page (\\d+)'``.
        chapter: Chapter the start page is expected to belong to. Kept for
            backward compatibility: archive names are taken from the
            chapter number found on each page, so no empty archive is
            created when this value does not match the start page.
    """
    alt_regex = re.compile(r'%s (\d+) - Page (\d+)' % title)
    url = '%s%s' % (url_prefix, url_suffix)
    cbz = None  # opened lazily, once the first page's chapter is known

    try:
        while True:
            f = urlopen(url)
            try:
                lines = f.readlines()
            finally:
                f.close()

            line = next((x for x in lines if x.find('id="img"') != -1), None)
            if line is None:
                break  # no page image on this page: nothing more to download

            info = get_info(line, alt_regex)
            if info is None:
                break  # get_info already printed why the line was unusable

            if cbz is None or info['chapter'] != chapter:
                # First page overall, or first page of a new chapter:
                # finish the previous archive and start the next one.
                if cbz is not None:
                    cbz.close()
                chapter = info['chapter']
                cbz = ZipFile('%03d.cbz' % chapter, "w", ZIP_DEFLATED)

            image = get_image(info['img_url'])
            if image is None:
                # Keep going so that one failed download does not abort
                # the remaining pages.
                print('[ERROR] skipping chapter %d page %d'
                      % (info['chapter'], info['page']))
            else:
                cbz.writestr('%02d.%s' % (info['page'], info['img_ext']), image)
            url = '%s%s' % (url_prefix, info['next'])
    finally:
        # Always close the archive in progress, otherwise its central
        # directory is never written and the file is unreadable.
        if cbz is not None:
            cbz.close()

# Start at http://www.mangareader.net/naruto/521 and download page by page,
# following the "next" link found on each page.
get_chapter('http://www.mangareader.net', '/naruto/521', 'Naruto')
So, I let it rip and now I'm busy reading... :)

No comments: