Príklad – extrakcia obrázkov z Wikipédie

Do HTML sa extrahuje prvých 10 dostupných obrázkov zo stránky en.wikipedia.org (http://www.wikipedia.org). Výstup sa obnovuje každých 15 minút.

 

Výstupy:

Source URL;Image URL;Name
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/thumb/9/9a/Salix_alaxensis_twig.jpg/100px-Salix_alaxensis_twig.jpg";"100px-Salix_alaxensis_twig.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/a/af/Wislawa_Szymborska_Cracow_Poland_October23_2009_Fot_Mariusz_Kubik_01.jpg/75px-Wislawa_Szymborska_Cracow_Poland_October23_2009_Fot_Mariusz_Kubik_01.jpg";"75px-Wislawa_Szymborska_Cracow_Poland_October23_2009_Fot_Mariusz_Kubik_01.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/thumb/a/a2/LOWEMZONEfeb08.PNG/65px-LOWEMZONEfeb08.PNG";"65px-LOWEMZONEfeb08.PNG"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Clavaria_zollingeri_90973.jpg/280px-Clavaria_zollingeri_90973.jpg";"280px-Clavaria_zollingeri_90973.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/9/9d/Commons-logo-31px.png";"Commons-logo-31px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/4/46/Wikiquote-logo-51px.png";"Wikiquote-logo-51px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/e/e3/Wikiversity-logo-41px.png";"Wikiversity-logo-41px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/7/7f/Wikibooks-logo-35px.png";"Wikibooks-logo-35px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/b/b6/Wikisource-logo-35px.png";"Wikisource-logo-35px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/f/f2/Wiktionary-logo-51px.png";"Wiktionary-logo-51px.png"
img-1 img-2 img-3 img-4 img-5 img-6 img-7 img-8 img-9 img-10

Zdrojový kód skriptu:

# File: imgdownloader_main.w
# Name: Image Downloader
# Description: The script opens the defined URL, finds the first 10 available images,
#              downloads them into the folder 'images', saves basic information into a CSV file
#              and generates an HTML file with the downloaded images.
# Input: URL
# Output format: CSV file, images, HTML
# Output CSV fields: Source URL, Image URL, Name

#<Logger File>
#    Global
#    FileName imgdownloader_log.log
#    Level debug
#</Logger>

# Main section: downloads the page at $url, then repeatedly extracts an <img>
# source from it, downloads that image into ./images and records it in the
# CSV and HTML output files.
<Section>
    Name imgdownloader_main

    # Output files (CSV report and HTML gallery) and the base URL that is
    # prepended to each image name in the generated HTML.
    Define $output_file imgdownloader_output.csv
    Define $output_file2 imgdownloader_output.html
    Define $path http://www.qualityunit.com/fileadmin/scripts/imgdownloader/images/

    # clean output files and download dir
    # (a Print action with FileMode Write and no Text is used here to reset
    # each output file before the loop appends to it)
    <Action Print>
        FileName {$output_file}
        FileMode Write
    </Action>
    <Action Print>
        FileName {$output_file2}
        FileMode Write
    </Action>
    <Action Exec>
        cmd rm ./images/*.*
    </Action>

    # page whose images are extracted
    Define $url http://en.wikipedia.org/wiki/Main_Page

    # downloading the content of URL
    <Action ContentURL>
        URL {$url}
        RemoveNewLine
    </Action>

    # One image is processed per iteration, at most 10 iterations.
    # NOTE(review): presumably the loop also stops as soon as neither pattern
    # below matches - confirm against the UnitMiner documentation.
    <Section While>
        MaxIterations 10

        # two types of patterns:
        #   1) src attribute in double quotes
        #   2) unquoted src attribute, terminated by a space
        # Either one stores the raw attribute value in $url_img.
        <Section Or>
            NoContext

            <Pattern>
                RegExp <img{:re(.*?)}src="{$url_img:re([^"]*)}"
                Trim
                Compact
                MultiLine
            </Pattern>

            <Pattern>
                RegExp <img{:re(.*?)}src={$url_img:re([^ ]*)} 
                Trim
                Compact
                MultiLine
            </Pattern>

        </Section>

        # relative address -> absolute address
        # The reference is resolved against $url in this order:
        #   "//host/path"  (protocol-relative) -> scheme of $url + ":" + reference
        #   "/path"        (root-relative)     -> scheme://host[:port] + reference
        #   no scheme      (document-relative) -> scheme://host[:port] + directory of $url + reference
        # References that already carry a scheme are left unchanged.
        # The previous implementation appended every non-"http" reference to
        # the full page path, producing URLs such as
        # http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/...,
        # and relied on eregi(), which no longer exists in PHP 7+.
        # The PHP code deliberately avoids curly braces so that it cannot be
        # mistaken for a {$variable} template.
        <Action Php>
            Code $imgSrc = $context->getVariable('$url_img');\
                $pageParts = parse_url($context->getVariable('$url'));\
                $pageRoot = $pageParts['scheme']."://".$pageParts['host']\
                .(isset($pageParts['port']) ? ":".$pageParts['port'] : "");\
                $pagePath = isset($pageParts['path']) ? $pageParts['path'] : "/";\
                $pageDir = substr($pagePath, 0, strrpos($pagePath, "/") + 1);\
                if (substr($imgSrc, 0, 2) == "//") $imgSrc = $pageParts['scheme'].":".$imgSrc;\
                elseif (substr($imgSrc, 0, 1) == "/") $imgSrc = $pageRoot.$imgSrc;\
                elseif (!parse_url($imgSrc, PHP_URL_SCHEME)) $imgSrc = $pageRoot.$pageDir.$imgSrc;\
                $context->setVariable('$url_img', $imgSrc);
        </Action>

        # image name: file name plus extension taken from the image URL
        <Action Php>
            Code $context->setVariable('$name_img',pathinfo($context->getVariable('$url_img'), PATHINFO_FILENAME)\
                .".".pathinfo($context->getVariable('$url_img'), PATHINFO_EXTENSION));
        </Action>

        # downloading image into the local 'images' folder
        <Action URLToFile>
            URL {$url_img}
            FileName images/{$name_img}
        </Action>

        # saving basic information into CSV file
        <Action SaveCSV>
            FileName {$output_file}
            Separator ;
            Column $url, Source URL
            Column $url_img, Image URL
            Column $name_img, Name
        </Action>

        # add img to HTML; the image is referenced via $path, and the
        # iteration counter is used for the alt/title texts
        <Action Print>
            FileName {$output_file2}
            Text <img src="{$path}{$name_img}" alt="img-{$_ITERATION}" title="image - {$_ITERATION}" />
        </Action>

    </Section>

</Section>

Main imgdownloader_main
Stay in touch with UnitMiner
© 2004-2012 QualityUnit.com, All rights reserved