<<  Unit Miner  <<  QualityUnit
 
Unit Miner - product box
Unit Miner Service

We can extract data for you!

Fast, reliable & affordable. Send us your requirements.

DATA RETRIEVED FOR BBC


EXTRACTION SCRIPT USED

# Extraction script for the BBC News front page.
#
# Loads {$main_url}/2/hi/default.stm, matches each headline block (link,
# title, description) and appends it to {$output_file} as an HTML fragment,
# followed by a bulleted list of the related-article links matched after it.
<Section>
    Name main

    # file that receives the generated HTML fragment
    Define $output_file bbcnews-output.html

    # clean the output file: a Print action in Write mode with no Text
    # (presumably truncates the file so later Print actions start from
    # empty - confirm against the Unit Miner documentation)
    <Action Print>
        FileName {$output_file}
        FileMode Write
    </Action>

    # base URL; it is also prepended to every extracted link below
    Define $main_url http://news.bbc.co.uk

    # load the front page content
    # NOTE(review): RemoveNewLine / TagsToStrip presumably drop line breaks
    #               and the listed tags (br, nobr, b) from the content
    #               before the patterns run - confirm
    <Action ContentURL>
        URL {$main_url}/2/hi/default.stm
        RemoveNewLine
        TagsToStrip br,nobr,b
    </Action>

    <Section>
        Name pattern-articles

        # match one news item: captures $link, $title and $desc
        <Pattern>
            # NOTE: {:re(\s*)}{$title}{:re(\s*)} trims whitespace before
            #       and after $title (same construct is used for $desc)
            RegExp <div class="mvb">*<a href="{$link}" class="tsh">\
                  {:re(\s*)}{$title}{:re(\s*)}</a>*</div>*<div class="mvb">\
                  {:re(\s*)}{$desc}{:re(\s*)}</div>

        </Pattern>

        # print parsed data: headline link, description, then the heading
        # and opening <p><ul> for the related-articles list
        # (fixed: the "Related articles" heading is now closed with </h1>;
        #  previously a second <h1> was opened instead)
        <Action Print>
            FileName {$output_file}
            Text <p><h1><a href="{$main_url}{$link}">{$title}</a></h1>\
                 </p>\n<p>{$desc}</p>\n\
                 <p><h1>Related articles:</h1></p>\n<p><ul>\n
        </Action>

        # find all news references
        # NOTE(review): "While" presumably repeats this section for as long
        #               as its pattern matches, and "Optional" presumably
        #               allows zero matches - confirm
        <Section While>
            Optional
            Name news-references

            # match one news reference: captures $link_url and $link_title
            <Pattern>
                RegExp <div class="sabull">*\
                       <a href="{$link_url}" class="tsl">\
                       {:re(\s*)}{$link_title}{:re(\s*)}</a>*</div>

            </Pattern>

            # print the reference as one list item
            <Action Print>
                FileName {$output_file}
                Text <li><a href="{$main_url}{$link_url}">{$link_title}\
                     </a></li>\n
            </Action>

        </Section>

        # close the related-articles list
        # (fixed: closes the <p> opened before <ul>; previously a new <p>
        #  was opened and never closed)
        <Action Print>
            FileName {$output_file}
            Text </ul></p>\n
        </Action>
    </Section>

</Section>

# entry point: names the section "main" defined above
Main main