From 144565488f226d4c2b07506eb2132ae3e58d1d33 Mon Sep 17 00:00:00 2001 From: jbagnatoMacPro Date: Tue, 29 Jan 2019 10:41:49 +0100 Subject: [PATCH] agrego ejemplo para hacer webscraping obtener valores de la bolsa de madrid y de resultados de futbol con beautifulsoap --- Ejemplo_WebScraping_Bolsa_y_Futbol.ipynb | 630 +++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 Ejemplo_WebScraping_Bolsa_y_Futbol.ipynb diff --git a/Ejemplo_WebScraping_Bolsa_y_Futbol.ipynb b/Ejemplo_WebScraping_Bolsa_y_Futbol.ipynb new file mode 100644 index 000000000..ad0b7f13e --- /dev/null +++ b/Ejemplo_WebScraping_Bolsa_y_Futbol.ipynb @@ -0,0 +1,630 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ejemplo de WebScraping con Python\n", + "## Obtener Ibex35 bolsa de Madrid" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:53:23.662759Z", + "start_time": "2019-01-27T17:53:23.651938Z" + } + }, + "outputs": [], + "source": [ + "# import libraries\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import csv\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:38:57.659158Z", + "start_time": "2019-01-27T17:38:57.655785Z" + } + }, + "outputs": [], + "source": [ + "# indicar la ruta\n", + "url_page = 'http://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:40:11.127856Z", + "start_time": "2019-01-27T17:40:10.642601Z" + } + }, + "outputs": [], + "source": [ + "# tarda 480 milisegundos\n", + "page = requests.get(url_page).text \n", + "soup = BeautifulSoup(page, \"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:42:00.515429Z", + 
"start_time": "2019-01-27T17:42:00.504178Z" + } + }, + "outputs": [], + "source": [ + "# Obtenemos la tabla por un ID específico\n", + "tabla = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})\n", + "tabla" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:55:08.443093Z", + "start_time": "2019-01-27T17:55:08.436365Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indice: IBEX 35®\n", + "Valor: 9.185,20\n" + ] + } + ], + "source": [ + "name=\"\"\n", + "price=\"\"\n", + "nroFila=0\n", + "for fila in tabla.find_all(\"tr\"):\n", + " if nroFila==1:\n", + " nroCelda=0\n", + " for celda in fila.find_all('td'):\n", + " if nroCelda==0:\n", + " name=celda.text\n", + " print(\"Indice:\", name)\n", + " if nroCelda==2:\n", + " price=celda.text\n", + " print(\"Valor:\", price)\n", + " nroCelda=nroCelda+1\n", + " nroFila=nroFila+1" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T17:56:28.279786Z", + "start_time": "2019-01-27T17:56:28.271938Z" + } + }, + "outputs": [], + "source": [ + "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", + "with open('bolsa_ibex35.csv', 'a') as csv_file:\n", + " writer = csv.writer(csv_file)\n", + " writer.writerow([name, price, datetime.now()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Obtener resultados de Futbol\n", + "## Ejemplo Liga BBVA - España - Primera - desde marcadores.com" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T18:26:04.881325Z", + "start_time": "2019-01-27T18:26:04.877803Z" + } + }, + "outputs": [], + "source": [ + "url_page = 'https://www.marcadores.com/futbol/espana/liga-bbva/?competitionRoundId=486942' # jornada 20" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + 
"metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T18:26:07.064454Z", + "start_time": "2019-01-27T18:26:05.493551Z" + } + }, + "outputs": [], + "source": [ + "# tarda 1500 milisegundos\n", + "page = requests.get(url_page).text \n", + "soup = BeautifulSoup(page, \"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T18:31:00.402982Z", + "start_time": "2019-01-27T18:31:00.299063Z" + } + }, + "outputs": [], + "source": [ + "# Obtenemos la tabla por un ID específico\n", + "tabla = soup.find('table', attrs={'class': 'matches'})\n", + "tabla" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T18:33:29.634584Z", + "start_time": "2019-01-27T18:33:29.621584Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partido: Getafe vs Alavés (1 - 0)\n", + "Partido: Real Madrid vs Sevilla (0 - 0)\n", + "Partido: Huesca vs Atlético de Madrid (0 - 1)\n", + "Partido: Celta vs Valencia (1 - 0)\n", + "Partido: Betis vs Girona (1 - 2)\n", + "Partido: Villarreal vs Athletic Bilbao (0 - 1)\n", + "Partido: Rayo Vallecano vs Real Sociedad (2 - 1)\n", + "Partido: Levante vs Valladolid (1 - 0)\n", + "Partido: Barcelona vs Leganés (1 - 0)\n", + "Partido: Eibar vs Espanyol (1 - 0)\n" + ] + } + ], + "source": [ + "data = []\n", + "equipo1=\"\"\n", + "equipo2=\"\"\n", + "resultado=\"\"\n", + "nroFila=0\n", + "for fila in tabla.find_all(\"tr\"):\n", + " if nroFila>0:\n", + " nroCelda=0\n", + " capturar=False\n", + " for celda in fila.find_all('td'):\n", + " if nroCelda==1 and celda.text=='Fin.':\n", + " capturar=True\n", + " if capturar and nroCelda==2:\n", + " equipo1=celda.text\n", + " if capturar and nroCelda==4:\n", + " equipo2=celda.text\n", + " if capturar and nroCelda==5:\n", + " resultado=celda.text\n", + " print(\"Partido:\", equipo1,'vs',equipo2,resultado)\n", + " 
data.append((equipo1,equipo2,resultado))\n", + " nroCelda=nroCelda+1\n", + " nroFila=nroFila+1" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-27T18:34:52.245736Z", + "start_time": "2019-01-27T18:34:52.233185Z" + } + }, + "outputs": [], + "source": [ + "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", + "with open('partidos_liga_primera.csv', 'a') as csv_file:\n", + " writer = csv.writer(csv_file)\n", + " for equipo1, equipo2,resultado in data:\n", + " writer.writerow([equipo1, equipo2, resultado,datetime.now()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Otros ejemplos de WebScraping" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:37.428233Z", + "start_time": "2019-01-28T18:12:37.420625Z" + } + }, + "outputs": [], + "source": [ + "#supongamos tenemos el siguiente HTML\n", + "pagina_web = \"\" \\\n", + " + \"\" \\\n", + " + \"\" \\\n", + " + \"
\" \\\n", + " + \"
\" \\\n", + " + \"Bienvenido a mi web\" \\\n", + " + \"
\" \\\n", + " + \"
\" \\\n", + " + \"\" \\\n", + " + \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:38.100550Z", + "start_time": "2019-01-28T18:12:38.096154Z" + } + }, + "outputs": [], + "source": [ + "soup = BeautifulSoup(pagina_web, \"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:38.677469Z", + "start_time": "2019-01-28T18:12:38.670578Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bienvenido a mi web'" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Obtener por ID:\n", + "elTexto = soup.find('div', attrs={'id': '123'}).getText()\n", + "print(elTexto)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:39.718413Z", + "start_time": "2019-01-28T18:12:39.712544Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bienvenido a mi web'" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Obtener por Clase CSS:\n", + "elTexto = soup.find('div', attrs={'class': 'verde'}).getText()\n", + "print(elTexto)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:40.673362Z", + "start_time": "2019-01-28T18:12:40.667391Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bienvenido a mi web'" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Obtener dentro de otra etiqueta anidado:\n", + "elTexto = next(soup.div.children).getText() #con next obtiene primer \"hijo\"\n", + "print(elTexto)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Obtener items de un listado" + ] + }, + { + "cell_type": "code", + "execution_count": 
98, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:42.573969Z", + "start_time": "2019-01-28T18:12:42.567849Z" + } + }, + "outputs": [], + "source": [ + "#supongamos tenemos el siguiente HTML\n", + "pagina_web = \"\" \\\n", + " + \"\" \\\n", + " + \"\" \\\n", + " + \"
\" \\\n", + " + \"\" \\\n", + " + \"
\" \\\n", + " + \"\" \\\n", + " + \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:43.383058Z", + "start_time": "2019-01-28T18:12:43.378520Z" + } + }, + "outputs": [], + "source": [ + "soup = BeautifulSoup(pagina_web, \"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:44.100260Z", + "start_time": "2019-01-28T18:12:44.094916Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perro\n", + "Gato\n", + "Tortuga\n" + ] + } + ], + "source": [ + "for child in soup.ul.children:\n", + " print(child.getText())" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-28T18:12:44.972925Z", + "start_time": "2019-01-28T18:12:44.967460Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perro\n", + "Gato\n", + "Tortuga\n" + ] + } + ], + "source": [ + "items = soup.find_all('li')\n", + "for item in items:\n", + " print(item.getText())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Obtener Enlaces" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-29T09:25:01.595660Z", + "start_time": "2019-01-29T09:25:01.576485Z" + } + }, + "outputs": [], + "source": [ + "#supongamos tenemos el siguiente HTML\n", + "pagina_web = \"\" \\\n", + " + \"\" \\\n", + " + \"\" \\\n", + " + \"
\" \\\n", + " + \"\" \\\n", + " + \"
\" \\\n", + " + \"\" \\\n", + " + \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-29T09:25:04.281499Z", + "start_time": "2019-01-29T09:25:04.250643Z" + } + }, + "outputs": [], + "source": [ + "soup = BeautifulSoup(pagina_web, \"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-29T09:25:43.006626Z", + "start_time": "2019-01-29T09:25:42.996092Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://www.google.com\n", + "http://www.yahoo.com\n", + "http://www.bing.com\n" + ] + } + ], + "source": [ + "items = soup.find_all('a')\n", + "for item in items:\n", + " print(item['href'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ejemplo completo Extraer enlaces" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-29T09:37:57.684739Z", + "start_time": "2019-01-29T09:37:57.069538Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www.lifeder.com/personajes-historicos/\n", + "https://www.lifeder.com/frases-de-albert-einstein/\n", + "https://www.lifeder.com/aportaciones-isaac-newton/\n", + "https://www.lifeder.com/frases-de-isaac-newton/\n", + "https://www.lifeder.com/frases-de-stephen-hawking/\n", + "https://www.lifeder.com/mujeres-famosas-historia/\n", + "https://www.lifeder.com/aportaciones-galileo-galilei/\n", + "https://www.lifeder.com/frases-de-galileo-galilei/\n", + "https://www.lifeder.com/frases-de-charles-darwin/\n", + "https://www.lifeder.com/aportaciones-kepler/\n", + "https://www.lifeder.com/frases-de-thomas-edison/\n", + "https://www.lifeder.com/frases-de-arquimedes/\n", + "https://www.lifeder.com/frases-de-leonardo-da-vinci/\n", + "https://www.lifeder.com/aportaciones-john-dalton/\n", + 
"https://www.lifeder.com/daltonismo/\n", + "https://www.lifeder.com/frases-de-rene-descartes/\n" + ] + } + ], + "source": [ + "url_page = 'https://www.lifeder.com/cientificos-famosos/'\n", + "page = requests.get(url_page).text \n", + "soup = BeautifulSoup(page, \"lxml\")\n", + "contenido = soup.find('div', attrs={'class': 'td-post-content'})\n", + "items = contenido.find_all('a')\n", + "for item in items:\n", + " print(item['href'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "El artículo completo en www.aprendemachinelearning.com" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}