Ejemplo - Descarga de imágenes de Wikipedia

El resultado HTML muestra las primeras 10 imágenes disponibles en http://en.wikipedia.org. El resultado se actualiza cada 15 minutos.

 

Resultados:

Source URL;Image URL;Name
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/7/73/Linden3cropped1.JPG/100px-Linden3cropped1.JPG";"100px-Linden3cropped1.JPG"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/3/30/William_Fawcett_%281902%E2%80%931941%29.png/95px-William_Fawcett_%281902%E2%80%931941%29.png";"95px-William_Fawcett_%281902%E2%80%931941%29.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/0/09/DrW.jpg/80px-DrW.jpg";"80px-DrW.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/9/98/Bernadette_Soubirous.jpg/78px-Bernadette_Soubirous.jpg";"78px-Bernadette_Soubirous.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/commons/thumb/f/f8/Tramp_smoking_cigar_with_cane_over_arm_-_restoration.jpg/250px-Tramp_smoking_cigar_with_cane_over_arm_-_restoration.jpg";"250px-Tramp_smoking_cigar_with_cane_over_arm_-_restoration.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/9/9d/Commons-logo-31px.png";"Commons-logo-31px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/4/46/Wikiquote-logo-51px.png";"Wikiquote-logo-51px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/e/e3/Wikiversity-logo-41px.png";"Wikiversity-logo-41px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/7/7f/Wikibooks-logo-35px.png";"Wikibooks-logo-35px.png"
"http://en.wikipedia.org/wiki/Main_Page";"http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/wikipedia/en/b/b6/Wikisource-logo-35px.png";"Wikisource-logo-35px.png"
img-1img-2img-3img-4img-5img-6img-7img-8img-9img-10

Código fuente del script:

# File: imgdownloader_main.w
# Name: Image Downloader
# Description: Script opens defined URL, finds first 10 available images,
#              downloads them into folder 'images', saves basic information into CSV file
#              and makes html with downloaded images.
# Input: URL
# Output format: CSV file, images, HTML
# Output CSV fields: Source URL, Image URL, Name

#<Logger File>
#    Global
#    FileName imgdownloader_log.log
#    Level debug
#</Logger>

<Section>
    Name imgdownloader_main

    # Output locations:
    #   $output_file  - CSV with one row per downloaded image
    #   $output_file2 - HTML fragment with one <img> tag per downloaded image
    #   $path         - public base URL under which the 'images' folder is served;
    #                   used only to build the <img src> values in the HTML output
    Define $output_file imgdownloader_output.csv
    Define $output_file2 imgdownloader_output.html
    Define $path http://www.qualityunit.com/fileadmin/scripts/imgdownloader/images/

    # clean output files and download dir
    # (a Print action in Write mode without Text is used here to reset each output file)
    <Action Print>
        FileName {$output_file}
        FileMode Write  
    </Action>
    <Action Print>
        FileName {$output_file2}
        FileMode Write  
    </Action>
    <Action Exec>
        cmd rm ./images/*.*
    </Action>

    # page whose images are collected
    Define $url http://en.wikipedia.org/wiki/Main_Page

    # downloading the content of URL
    <Action ContentURL>
        URL {$url} 
        RemoveNewLine
    </Action>

    # one iteration per image, at most 10 images
    <Section While>
        MaxIterations 10

        # two types of patterns: quoted and unquoted src attribute;
        # either one stores the raw attribute value in $url_img
        <Section Or>
            NoContext

            <Pattern>
                RegExp <img{:re(.*?)}src="{$url_img:re([^"]*)}"
                Trim
                Compact
                MultiLine
            </Pattern>

            <Pattern>
                RegExp <img{:re(.*?)}src={$url_img:re([^ ]*)} 
                Trim
                Compact
                MultiLine
            </Pattern>

        </Section>

        # relative address -> absolute address
        # Resolves $url_img against the page URL ($url):
        #   http://... or https://...  -> kept unchanged
        #   //host/path                -> page scheme is prepended (protocol-relative)
        #   /path                      -> page scheme and host are prepended (root-relative)
        #   anything else              -> resolved against the directory of the page path
        # Previously every non-http value was appended to the full page URL, which
        # produced addresses such as
        # http://en.wikipedia.org/wiki/Main_Page///upload.wikimedia.org/...
        # The PHP code deliberately uses no braces so that it cannot be confused
        # with the script's own {...} variable templates.
        <Action Php>
            Code $img_ref = $context->getVariable('$url_img');\
                $page_url = $context->getVariable('$url');\
                $scheme = parse_url($page_url, PHP_URL_SCHEME);\
                $origin = $scheme."://".parse_url($page_url, PHP_URL_HOST);\
                $page_path = (string) parse_url($page_url, PHP_URL_PATH);\
                $page_dir = substr($page_path, 0, (int) strrpos($page_path, "/") + 1);\
                if (stripos($img_ref, "http://") === 0 || stripos($img_ref, "https://") === 0) $abs_url = $img_ref;\
                elseif (substr($img_ref, 0, 2) == "//") $abs_url = $scheme.":".$img_ref;\
                elseif (substr($img_ref, 0, 1) == "/") $abs_url = $origin.$img_ref;\
                else $abs_url = $origin.($page_dir == "" ? "/" : $page_dir).$img_ref;\
                $context->setVariable('$url_img', $abs_url);
        </Action>

        # image name: last path component of the image URL (file name + "." + extension)
        <Action Php>
            Code $context->setVariable('$name_img',pathinfo($context->getVariable('$url_img'), PATHINFO_FILENAME)\
                .".".pathinfo($context->getVariable('$url_img'), PATHINFO_EXTENSION));
        </Action>

        # downloading image into the local 'images' folder
        <Action URLToFile>
            URL {$url_img}
            FileName images/{$name_img}
        </Action>

        # saving basic information into CSV file
        <Action SaveCSV>
            FileName {$output_file}
            Separator ;
            Column $url, Source URL
            Column $url_img, Image URL
            Column $name_img, Name
        </Action>

        # add img to HTML
        <Action Print>
            FileName {$output_file2}
            Text <img src="{$path}{$name_img}" alt="img-{$_ITERATION}" title="image - {$_ITERATION}" />
        </Action>

    </Section>

</Section>

Main imgdownloader_main
Stay in touch with UnitMiner
© 2004-2012 QualityUnit.com, All rights reserved