DATA RETRIEVED FOR BBC
EXTRACTION SCRIPT USED
<Section>
Name main
# Purpose: load the BBC News front page, extract a headline article
# (link, title, description) together with its related-article links,
# and write the result as an HTML fragment to $output_file.
# output file used by every Print action in this script
Define $output_file bbcnews-output.html
# clean file: a Print action with FileMode Write and no Text
# NOTE(review): presumably this creates/truncates the file so that the
# later Print actions (which set no FileMode) add to it - confirm
<Action Print>
FileName {$output_file}
FileMode Write
</Action>
# define variable $main_url and assign it value; it is used both to build
# the page URL below and as the prefix for every extracted link
Define $main_url http://news.bbc.co.uk
# load content of the front page; newlines and the listed inline tags
# (br, nobr, b) are removed so the patterns below can match across them
<Action ContentURL>
URL {$main_url}/2/hi/default.stm
RemoveNewLine
TagsToStrip br,nobr,b
</Action>
<Section>
Name pattern-articles
# match a news article: captures $link, $title and $desc
<Pattern>
# NOTE: {:re(\s*)}{$title}{:re(\s*)} trim whitespaces before
# and after $title
# NOTE(review): the bare * between tags appears to act as a wildcard for
# arbitrary intervening markup - confirm against the tool's documentation
RegExp <div class="mvb">*<a href="{$link}" class="tsh">\
{:re(\s*)}{$title}{:re(\s*)}</a>*</div>*<div class="mvb">\
{:re(\s*)}{$desc}{:re(\s*)}</div>
</Pattern>
# print parsed data: linked headline, description, then the heading and
# the opening <p><ul> of the related-articles list (closed by the footer
# Print action at the end of this section)
# NOTE(review): {$main_url}{$link} assumes $link is site-relative - confirm
<Action Print>
FileName {$output_file}
Text <p><h1><a href="{$main_url}{$link}">{$title}</a></h1>\
</p>\n<p>{$desc}</p>\n\
<p><h1>Related articles:</h1></p>\n<p><ul>\n
</Action>
# find all news references belonging to the article matched above
# (Optional: presumably the enclosing section still succeeds when no
# reference is found - confirm)
<Section While>
Optional
Name news-references
# match news references: captures $link_url and $link_title
<Pattern>
RegExp <div class="sabull">*\
<a href="{$link_url}" class="tsl">\
{:re(\s*)}{$link_title}{:re(\s*)}</a>*</div>
</Pattern>
# print one list item per matched reference
<Action Print>
FileName {$output_file}
Text <li><a href="{$main_url}{$link_url}">{$link_title}\
</a></li>\n
</Action>
</Section>
# print html footer: close the <ul> and the <p> opened before the list
<Action Print>
FileName {$output_file}
Text </ul></p>\n
</Action>
</Section>
</Section>
# NOTE(review): presumably selects the section named "main" as the entry point
Main main