Diferencia entre revisiones de «LSWC scraping the web/presentacion lwsc 2011»
De WikiEducator
| Línea 24: | Línea 24: | ||
<div class="slide"> | <div class="slide"> | ||
| − | === === | + | === Búsqueda "bruta" === |
| + | <source lang="python"> | ||
| + | import urllib2 | ||
| + | URL = 'http://www.libresoftwareworldconference.com/' | ||
| + | source = urllib2.urlopen(URL).read() | ||
| + | </source> | ||
| + | * Proceso del texto | ||
| + | * Expresiones regulares | ||
| + | </div> | ||
| + | |||
| + | |||
| + | <div class="slide"> | ||
| + | === Librerías en Python === | ||
| + | * Beautiful Soup | ||
| + | * mechanize | ||
| + | * lxml | ||
| + | * html5lib | ||
| + | * scrapemark | ||
| + | * pyquery | ||
| + | * scrapy | ||
| + | ... | ||
</div> | </div> | ||
<div class="slide"> | <div class="slide"> | ||
| − | === | + | === amara === |
</div> | </div> | ||
