# ancor_extracor.py

import urllib
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    def reset(self):                              
        SGMLParser.reset(self)
        self.urls = []
        
    def start_a(self, attrs):                     
        
        href = [v for k, v in attrs if k=='href'] 
        if href:
            self.urls.extend(href)
            
            
if __name__ == "__main__":
    # Demo: download one page and print the href of every anchor it contains.
    page = urllib.urlopen("http://www.ke.informatik.tu-darmstadt.de/lehre/ss08/web-mining/uebungen.html")
    try:
        html = page.read()
    finally:
        # Always release the connection, even if reading the body fails.
        page.close()

    extractor = URLLister()
    extractor.feed(html)
    # close() makes the parser process any data still buffered.
    extractor.close()

    for u in extractor.urls:
        # Parenthesised form prints the same single value under the
        # Python 2 print statement and is also valid as a function call.
        print(u)
# A A A | Drucken | Impressum | Sitemap | Suche | Mobile Version
# zum Seitenanfang