I use something that I brewed myself - sorry, but I don't have time to provide any kind of support, so it's just you and this code:
Edit:
I've found some time now to record the steps to make use of the above script.
1. Save the python code to the dl.py file
If you don't care about tagging the mp3 files, then
2. Remove lines 118-138
(otherwise things get even messier...)
In Google Chrome (I believe that other browsers allow for a similar workflow as well)
1. Open "http://www3.nhk.or.jp/news/easy/index.html"
2. Select "11月26日(水)"
3. Right-click > Inspect Element on "「黒人を撃った警察官を訴えない」アメリカ中で抗議"
In Developer Tools:
1. Locate '<ul class="newslisteven">'
2. Right-click > Copy '<ul class="newslisteven">'
3. Save the Clipboard to the newslisteven.html file (this file has to be in the same directory as the dl.py script)
Command box part:
1. Open the command window
2. Change the current directory to the location where dl.py and newslisteven.html are
3. Execute 'python -u dl.py'
After the script finishes executing, you should see .txt and .mp3 files for 11月26日(水).
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import subprocess
import string
import urllib
from HTMLParser import HTMLParser
# Track number written into the tags of the next mp3; get_article()
# increments it after each article.
track_nb = 64
# Common prefix of the article page URLs and the mp3 URLs.
url_base = 'http://www3.nhk.or.jp/news/easy/'
# Appended to the album name written into the mp3 tags.
time_stamp = '2014-11'
class NHKArticleParser(HTMLParser):
    """Extract the title/body text and the main image URL of an article page.

    Text is accumulated in ``data`` while the parser is inside a
    ``<div id="newstitle">`` or ``<div id="newsarticle">`` element; text
    inside ``<rt>`` elements is left out.  The ``src`` of an ``<img>`` found
    inside ``<p id="mainimg">`` is stored in ``img_url`` (``''`` if none was
    seen).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = ''
        # Current <div> nesting depth.
        self.div_cnt = 0
        # Depth of the title/article <div> being captured; 0 when outside.
        self.div_cnt_article = 0
        self.is_rt = False
        self.is_img = False
        self.img_url = ''

    def handle_starttag(self, tag, attrs):
        """Track entry into the image paragraph, divs and ``<rt>``."""
        if tag == 'p':
            if ('id', 'mainimg') in attrs:
                self.is_img = True
        elif tag == 'img' and self.is_img:
            for name, value in attrs:
                # A valueless ``src`` attribute is reported as None; storing
                # that would break callers that treat img_url as a string.
                if name == 'src' and value is not None:
                    self.img_url = value
        elif tag == 'div':
            self.div_cnt += 1
            if ('id', 'newstitle') in attrs or ('id', 'newsarticle') in attrs:
                self.div_cnt_article = self.div_cnt
        elif tag == 'rt':
            self.is_rt = True

    def handle_endtag(self, tag):
        """Undo the state changes made by the matching start tag."""
        if tag == 'p' and self.is_img:
            self.is_img = False
        elif tag == 'rt':
            self.is_rt = False
        elif tag == 'div':
            if self.div_cnt == 0:
                # Stray </div> with no open <div>: ignore it so the depth
                # never goes negative (a negative depth would make a later
                # top-level article div register at depth 0 and be skipped).
                return
            self.div_cnt -= 1
            if self.div_cnt < self.div_cnt_article:
                # Left the title/article div: stop capturing text.
                self.div_cnt_article = 0

    def handle_data(self, data):
        """Append text that lies inside a captured div and outside ``<rt>``."""
        if self.div_cnt_article > 0 and not self.is_rt:
            self.data += data
# Parser for newslisteven.html
# newslisteven.html is a file that results from saving 'Copy as HTML' of
# <ul class="newslisteven"> from http://www3.nhk.or.jp/news/easy/index.html
class NHKNewsListEvenParser(HTMLParser):
    """Collect the article links contained in the saved news-list snippet.

    The ``href`` of every ``<a>`` element is appended to ``articles`` in
    document order, with a leading ``./`` removed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.articles = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an ``<a>`` tag, if it has one."""
        if tag != 'a':
            return
        # Look the attribute up by name rather than assuming it comes first;
        # anchors without a usable href are skipped instead of raising.
        href = dict(attrs).get('href')
        if not href:
            return
        if href.startswith('./'):
            href = href[2:]
        self.articles.append(href)
def get_article(article_id):
    """Download one article: save its text, fetch its mp3 and tag it.

    ``article_id`` is the article page path relative to ``url_base``.  The
    function writes ``<title>.txt`` and ``<title>.mp3`` into the current
    directory, tags the mp3 by running the external eyeD3 script, and
    increments the module-level ``track_nb``.

    This is Python 2 code (``urllib.urlretrieve``, ``unicode``).
    """
    global track_nb
    fn_img_tmp = 'fn_img'
    fn_mp3_tmp = 'fn_mp3'
    url = url_base + article_id

    filename, _ = urllib.urlretrieve(url)
    with open(filename) as page_file:
        article = page_file.read()

    parser = NHKArticleParser()
    # Insert the space missing between two attributes before parsing
    # (presumably the page's markup omits it -- confirm against a live page).
    parser.feed(article.replace('"shorturl"content', '"shorturl" content'))
    parser.close()

    # Only fetch a cover image when the page actually had one; an empty
    # img_url would otherwise resolve to the article's directory URL.
    img_url = parser.img_url
    has_image = bool(img_url)
    if has_image:
        if img_url[0:4] != 'http':
            img_url = os.path.dirname(url) + '/' + img_url
        urllib.urlretrieve(img_url, fn_img_tmp)

    data = re.sub('\n +', '\n', parser.data).lstrip('\n')
    lines = data.split('\n')
    title = lines[1] + lines[0]
    # Make sure the title is a valid filename
    title = ''.join(ch for ch in title if ch not in '\\/:*?<>|')
    print(title)

    # Keep the first three lines verbatim, then drop empty lines.
    kept = lines[0:3] + [line for line in lines[3:] if line]
    data = ''.join(line + '\n' for line in kept)
    with open(unicode(title + '.txt', 'utf8'), 'w') as text_file:
        text_file.write(data)

    # Swap the page extension for .mp3 without relying on where the text
    # 'html' first occurs in the id.
    url_mp3 = url_base + os.path.splitext(article_id)[0] + '.mp3'
    print('%s - %s' % (url_mp3, track_nb))
    fn_mp3 = unicode(title + '.mp3', 'utf8')
    urllib.urlretrieve(url_mp3, fn_mp3_tmp)

    command = [
        'python',
        r'C:\Users\<your-profile-name>\Commands\eyeD3',
        '--remove-all',
        '--no-color',
        '--to-v2.3',
        '--set-encoding=utf16-LE',
        '--force-update',
        '--artist=NHKニュース',
        '--album=NHKニュース - ' + time_stamp,
        '--genre=Non-fiction',
        '--title=' + title,
        '--track=' + str(track_nb),
    ]
    if has_image:
        command.append('--add-image=' + fn_img_tmp + ':FRONT_COVER')
    command.append(fn_mp3_tmp)
    # Run the argument list directly: with shell=True a list is mishandled on
    # POSIX and the title would be exposed to shell metacharacters.
    subprocess.call(command)

    track_nb += 1
    os.rename(fn_mp3_tmp, fn_mp3)
    if has_image:
        os.remove(fn_img_tmp)
# Read the saved <ul class="newslisteven"> snippet and collect the article
# links it contains.
parser = NHKNewsListEvenParser()
# Use a context manager so the file handle is closed right after reading.
with open('newslisteven.html') as news_list_file:
    parser.feed(news_list_file.read())
parser.close()
for article in reversed(parser.articles):
    get_article(article)
I've found some time now to record the steps to make use of the above script.
1. Save the python code to the dl.py file
If you don't care about tagging the mp3 files, then
2. Remove lines 118-138
(otherwise things get even messier...)
In Google Chrome (I believe that other browsers allow for a similar workflow as well)
1. Open "http://www3.nhk.or.jp/news/easy/index.html"
2. Select "11月26日(水)"
3. Right-click > Inspect Element on "「黒人を撃った警察官を訴えない」アメリカ中で抗議"
In Developer Tools:
1. Locate '<ul class="newslisteven">'
2. Right-click > Copy '<ul class="newslisteven">'
3. Save the Clipboard to the newslisteven.html file (this file has to be in the same directory as the dl.py script)
Command box part:
1. Open the command window
2. Change the current directory to the location where dl.py and newslisteven.html are
3. Execute 'python -u dl.py'
After the script finishes executing, you should see .txt and .mp3 files for 11月26日(水).
Edited: 2014-11-26, 5:20 am
