Back

Automatic audio extraction from NHK News Easy?

#2
I use something that I brewed myself - sorry, but I don't have time to provide any kind of support, so it's just you and this code:
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re
import subprocess
import string
import urllib
from HTMLParser import HTMLParser

track_nb = 64  # track number written to the next mp3; get_article() increments it
url_base = 'http://www3.nhk.or.jp/news/easy/'  # prefix for article and mp3 URLs
time_stamp = '2014-11'  # appended to the album name passed to eyeD3

class NHKArticleParser(HTMLParser):
    """Extract the text and the main image URL from one article page.

    After feeding a page, two attributes hold the result:

    data    -- the text found inside <div id="newstitle"> and
               <div id="newsarticle">, with the content of <rt> elements
               left out
    img_url -- the src of the <img> found inside <p id="mainimg">
               ('' when no such image was seen)
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Results
        self.data = ''
        self.img_url = ''
        # True while inside <p id="mainimg"> / inside an <rt> element
        self.is_img = False
        self.is_rt = False
        # Current <div> nesting depth, and the depth at which the
        # title/article <div> was opened (0 = not inside one)
        self.div_cnt = 0
        self.div_cnt_article = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            self.div_cnt += 1
            is_wanted = any(
                ('id', div_id) in attrs
                for div_id in ('newstitle', 'newsarticle')
            )
            if is_wanted:
                self.div_cnt_article = self.div_cnt
            return

        if tag == 'rt':
            self.is_rt = True
            return

        if tag == 'p':
            if ('id', 'mainimg') in attrs:
                self.is_img = True
            return

        if tag == 'img' and self.is_img:
            # When src is given more than once the last value is kept
            for name, value in attrs:
                if name == 'src':
                    self.img_url = value

    def handle_endtag(self, tag):
        if tag == 'div':
            self.div_cnt -= 1
            # Dropping below the depth of the title/article <div> means
            # that <div> has just been closed
            if self.div_cnt_article > self.div_cnt:
                self.div_cnt_article = 0
        elif tag == 'rt':
            self.is_rt = False
        elif tag == 'p':
            self.is_img = False

    def handle_data(self, data):
        if self.is_rt or self.div_cnt_article <= 0:
            return
        self.data += data

# Parser for newslisteven.html
# newslisteven.html is a file that results from saving 'Copy as HTML' of
# <ul class="newslisteven"> from http://www3.nhk.or.jp/news/easy/index.html
class NHKNewsListEvenParser(HTMLParser):
    """Collect the article paths linked from the saved news list markup.

    After feeding the markup, self.articles holds one entry per <a> element
    that has an href, in document order, with a leading './' removed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.articles = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return

        # Look the link target up by name: href is not guaranteed to be the
        # first attribute, and an <a> may have no attributes at all.
        href = None
        for name, value in attrs:
            if name == 'href':
                href = value
                break
        if not href:
            return

        # Links in the list are relative ('./<id>/<id>.html'); drop the './'
        # only when it is really there.
        if href.startswith('./'):
            href = href[2:]
        self.articles.append(href)

def get_article(article_id):
    """Download one article: its text as <title>.txt, its audio as <title>.mp3.

    article_id is the page path relative to url_base.  It must contain
    'html'; the audio URL is article_id with everything from the first
    'html' on replaced by 'mp3'.

    The page is parsed with NHKArticleParser and the extracted text is
    written to the .txt file.  The mp3 is downloaded to a temporary name,
    tagged by running eyeD3 and then renamed to the article title.  The
    module-level track_nb is used as the track number and is incremented
    once per call.

    Raises ValueError when article_id contains no 'html', or when fewer than
    two lines of text could be extracted from the page (no file name can be
    built from it).  Download errors raised by urllib are not caught here.
    """
    global track_nb
    fn_img_tmp = 'fn_img'
    fn_mp3_tmp = 'fn_mp3'

    pos = article_id.find('html')
    if pos < 0:
        # Slicing with -1 would silently build a wrong mp3 URL
        raise ValueError("article id without 'html': " + article_id)
    url = url_base + article_id
    url_mp3 = url_base + article_id[0: pos] + 'mp3'

    filename = urllib.urlretrieve(url)[0]
    try:
        with open(filename) as page:
            article = page.read()
    finally:
        # Remove the temporary file urlretrieve() created for the page
        urllib.urlcleanup()

    # The page has two attributes run together; presumably the missing space
    # trips up HTMLParser, so it is inserted before parsing.
    article_m = article.replace('"shorturl"content', '"shorturl" content')
    parser = NHKArticleParser()
    parser.feed(article_m)
    parser.close()

    data = re.sub('\n +', '\n', parser.data).lstrip('\n')
    lines = data.split('\n')
    if len(lines) < 2:
        raise ValueError('no article text found in ' + url)
    # The file name is the second extracted line followed by the first
    title = lines[1] + lines[0]

    # Make sure the title is a valid filename
    title = ''.join(ch for ch in title if ch not in r'\/:*?<>|"')
    print(title)

    # The first three lines are kept as they are; after that, empty lines
    # are dropped.
    kept = lines[0: 3] + [line for line in lines[3:] if line]
    with open(unicode(title + '.txt', 'utf8'), 'w') as text_file:
        text_file.write(''.join(line + '\n' for line in kept))

    print('%s - %s' % (url_mp3, track_nb))
    fn_mp3 = unicode(title + '.mp3', 'utf8')

    # Without a main image there is nothing to download or embed
    has_img = bool(parser.img_url)
    try:
        if has_img:
            img_url = parser.img_url
            if not img_url.startswith('http'):
                img_url = os.path.dirname(url) + '/' + img_url
            urllib.urlretrieve(img_url, fn_img_tmp)

        urllib.urlretrieve(url_mp3, fn_mp3_tmp)

        # Tag the mp3 with eyeD3 (remove this part, down to and including
        # subprocess.call, if you do not want the files tagged)
        command = [
            'python',
            r'C:\Users\<your-profile-name>\Commands\eyeD3',
            '--remove-all',
            '--no-color',
            '--to-v2.3',
            '--set-encoding=utf16-LE',
            '--force-update',
            '--artist=NHKニュース',
            '--album=NHKニュース - ' + time_stamp,
            '--genre=Non-fiction',
            '--title=' + title,
            '--track=' + str(track_nb),
        ]
        if has_img:
            command.append('--add-image=' + fn_img_tmp + ':FRONT_COVER')
        command.append(fn_mp3_tmp)
        # No shell: the title comes from a web page and must not be
        # interpreted by the command interpreter.
        subprocess.call(command)

        track_nb += 1
        os.rename(fn_mp3_tmp, fn_mp3)
    finally:
        # Do not leave temporary downloads behind, even after a failure
        for fn_tmp in (fn_img_tmp, fn_mp3_tmp):
            if os.path.exists(fn_tmp):
                os.remove(fn_tmp)

# Read the saved list markup and collect the article paths it links to
parser = NHKNewsListEvenParser()
with open('newslisteven.html') as list_file:
    parser.feed(list_file.read())
parser.close()
# Fetch the articles in the reverse of their order in the list
# (presumably so that track numbers ascend from oldest to newest - verify)
for article in reversed(parser.articles):
    get_article(article)
Edit:
I've found some time now to record the steps to make use of the above script.

1. Save the python code to the dl.py file
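If you don't care about tagging the mp3 files, then
2. Remove the tagging step from get_article, i.e. the "command = [...]" list and the "subprocess.call(...)" that follows it (lines 118-138 of the script as posted above)
(otherwise things get even messier...)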

In Google Chrome (I believe that other browsers allow for a similar workflow as well)
1. Open "http://www3.nhk.or.jp/news/easy/index.html"
2. Select "11月26日(水)"
3. Right-click > Inspect Element on "「黒人を撃った警察官を訴えない」アメリカ中で抗議"

In Developer Tools:
1. Locate '<ul class="newslisteven">'
2. Right-click > Copy '<ul class="newslisteven">'
3. Save the Clipboard to the newslisteven.html file (this file has to be in the same directory as the dl.py script)

Command box part:
1. Open the command window
2. Change the current directory to the location where dl.py and newslisteven.html are
3. Execute 'python -u dl.py'

After the script finishes, you should see .txt and .mp3 files for 11月26日(水).
Edited: 2014-11-26, 5:20 am
Reply

Messages In This Thread