# WEEK 5: Mapping & Web Crawling

---

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go 

py.sign_in('USER NAME', 'API TOKEN')

## Map
We will learn two types of maps: the scatter map and the filled map. A scatter map plots individual points on a geographic map, while a filled map shows the value of each region by changing its color on the map.
<br>For more details: https://plot.ly/python/reference/#scattermapbox and https://plot.ly/python/reference/#choropleth

### 1. Scatter Map

We will rely on a built-in tool in plot.ly, named "mapbox". Mapbox is an independent IT company focusing on developing GIS-related services. It works with plot.ly, IBM, and Google to provide far-reaching and accessible tools on their platforms. In order to use it, you need to sign up for a Mapbox account: https://www.mapbox.com/

In [None]:
mapbox_token='YOUR TOKEN'

Besides, we need to use the Google Maps API to look up the coordinates of places. So please go to Google Cloud Platform: https://console.cloud.google.com/google/maps-apis and activate the Places API.

<div class="alert alert-block alert-warning">
**<b>Reminder</b>** Free account only allows you to call Google Places API 2 times per minute and 1,000 times per day. These limits will be increased to 20 and 150,000 if you enable billing to verify your identity. </div>

In [None]:
#install googlemaps library
! pip3 install googlemaps

In [None]:
import googlemaps

place_api='YOUR TOKEN'

In [None]:
client=googlemaps.Client(key=place_api) #create a client variable with your api

In [126]:
univs=client.places('universities in hong kong') #search for some places

In [None]:
type(univs) #look into the search result. It's a dictionary.

In [None]:
univs.keys() #search results are stored with the key of "results"

In [None]:
#collect results on the first page
names=[] #initialize a blank list for storing university names
geos=[] #one [latitude, longitude] pair per university
ratings=[] #one rating per university
for place in univs['results']: #go over every university and store its name, geolocation and rating into three blank lists respectively
    location=place['geometry']['location']
    names.append(place['name'])
    #read lat/lng by key so the [lat, lon] order never depends on the dictionary's key order
    geos.append([location['lat'],location['lng']])
    #a place may come back without a rating; fall back to 0 instead of raising KeyError
    ratings.append(place.get('rating',0))

### Time control
When you are collecting data with a computer program, you need to add a time delay every now and then to:
1. Wait for the page to be loaded
2. Avoid being detected as a bot or exceeding the rate limit

In [None]:
import time

In [None]:
time.sleep(5) #stop for 5 seconds

In [None]:
#-----------------------------
#define a reusable function to update current page information
def update_current_page(univs,names,geos,ratings,default_rating=0):
    """Append every place on one result page to the three running lists.

    univs: search response; its 'results' list is iterated, and each item is
        read for 'name', 'geometry' -> 'location' -> 'lat'/'lng' and 'rating'.
    names, geos, ratings: lists that are extended in place.
    default_rating: value stored when a place carries no 'rating' key.

    Returns the same three lists as a tuple (names, geos, ratings).
    """
    for place in univs['results']: #go over every university on current page
        location=place['geometry']['location']
        names.append(place['name'])
        #read lat/lng by key so the [lat, lon] order never depends on the dictionary's key order
        geos.append([location['lat'],location['lng']])
        #a missing rating no longer raises KeyError and aborts the whole collection
        ratings.append(place.get('rating',default_rating))
    return(names,geos,ratings) #return updated lists
#-----------------------------

#-----------------------------
#start from three empty lists and fill them from the first page of results
names,geos,ratings=[],[],[]
univs=client.places('universities in hong kong')
names,geos,ratings=update_current_page(univs,names,geos,ratings)

#-----------------------------
#keep requesting pages for as long as the last response carries a next_page_token
page=1
while 'next_page_token' in univs:
    time.sleep(30) #pause before every follow-up request to stay within the rate limit
    page+=1
    print('go to next page:',page)
    next_page=univs['next_page_token']
    univs=client.places('universities in hong kong',page_token=next_page)
    names,geos,ratings=update_current_page(univs,names,geos,ratings)

print('DONE!')

In [None]:
#create a list of Scattermapbox objects. Each object stands for one scatter point on the map.
#zip walks the three parallel lists together, so no index bookkeeping is needed
data=[
    go.Scattermapbox(lat=[geo[0]],lon=[geo[1]],text=name,
                     marker={'size':rating*2}) #marker size is twice the rating
    for name,geo,rating in zip(names,geos,ratings)
]

In [None]:
#update the layout
#the map is centred on the first collected place
layout=go.Layout(
    mapbox=dict(
        accesstoken=mapbox_token,
        style='dark',
        center=dict(lat=geos[0][0],lon=geos[0][1]),
        zoom=10,
    ),
    showlegend=False,
)

In [None]:
fig=go.Figure(data,layout)
py.iplot(fig,filename='map')

### 2. Filled Map
Fill regions on the map with certain colors to represent the statistics. This type of map has an academic name of "choropleth map".

In [None]:
import pandas as pd
freedom_table=pd.read_csv('https://juniorworld.github.io/python-workshop-2018/doc/human-freedom-index.csv')

In [None]:
freedom_table.head() #the first column, i.e. the ISO country code, can be used to create a map.

In [None]:
#basic filled map: countries are located by ISO code and coloured by their human freedom value
trace=go.Choropleth(locations=freedom_table['ISO_code'],
                    z=freedom_table['human freedom'],
                    text=freedom_table['countries'])
py.iplot([trace],filename='map')

In [None]:
#change color scale
#same map as before, with the colorscale set explicitly
trace=go.Choropleth(locations=freedom_table['ISO_code'],
                    z=freedom_table['human freedom'],
                    text=freedom_table['countries'],
                    colorscale='RdBu')
py.iplot([trace],filename='map')

In [None]:
#change the map design by redefining line setting in marker parameter
trace=go.Choropleth(locations=freedom_table['ISO_code'],
                    z=freedom_table['human freedom'],
                    text=freedom_table['countries'],
                    colorscale='RdBu',
                    marker=dict(line=dict(color='white',width=0.2))) #thin white lines

py.iplot([trace],filename='map')

In [None]:
#remove coastlines
trace=go.Choropleth(locations=freedom_table['ISO_code'],
                    z=freedom_table['human freedom'],
                    text=freedom_table['countries'],
                    colorscale='RdBu',
                    marker=dict(line=dict(color='white',width=0.2)))
#coastlines are switched off in the geo section of the layout, not in the trace
layout=go.Layout(geo=dict(showcoastlines=False))
fig=go.Figure([trace],layout)
py.iplot(fig,filename='map')

In [None]:
#try other alternative types of projection in the map layout
#Alternative types: 'equirectangular', 'mercator', 'orthographic', 'natural earth', 'kavrayskiy7', 'miller', 'robinson',
#'eckert4', 'azimuthal equal area', 'azimuthal equidistant', 'conic equal area', 'conic conformal', 'conic equidistant', 
#'gnomonic', 'stereographic', 'mollweide', 'hammer', 'transverse mercator', 'albers usa', 'winkel tripel', 'aitoff', 'sinusoidal'
trace=go.Choropleth(locations=freedom_table['ISO_code'],
                    z=freedom_table['human freedom'],
                    text=freedom_table['countries'],
                    colorscale='RdBu',
                    marker=dict(line=dict(color='white',width=0.2)))
#the projection type is chosen in the geo section of the layout
layout=go.Layout(geo=dict(projection=dict(type='orthographic')))
fig=go.Figure([trace],layout)
py.iplot(fig,filename='map')

#### <font style="color: blue">Practice:</font>
---
<font style="color: blue">Please create a world map representing the GDP values of the countries recorded in freedom_table. The map should meet following requirements:<br>
    1. colorscale = Reds
    2. projection type: natural earth
</font>

In [None]:
#Write your code here






---

## Break

---

## Web Crawling
We will introduce two methods to collect data: web crawling (this week) and calling API (next week).<br>
Web crawling means designing an automated bot that imitates human browsing behavior.

### Understanding HTML
- HTML stands for **Hyper Text Markup Language**, which is used to define a website.
- All HTML contents are hierarchical and structured.
    - Basic Element: `Tag` and `Text`
    - Text is the content shown on the screen. **Tag is not displayed but is used to render the text.**
    - Text is wrapped by start and end tags.
    - Tag: denoted by a pair of angle brackets <>
        - Start Tag
            - Tag Name
            - Attributes (optional): attributes provide additional information about the element
                - Attribute Name
                - Attribute Value
            - format: <...>
        - End Tag
            - format: </...>
        - Most tags are used in pairs, <font style="color:red">except void tags such as the line break tag <b>&lt;br&gt;</b> and the input box tag <b>&lt;input&gt;</b></font>, which have no end tag.

---

### HTML in Jupyter Notebook
We can easily embed HTML in Jupyter Notebook by turning the cell type into "Markdown". There are two ways to achieve this:<br>
1. Click <kbd>Code</kbd> in the tool bar -> select <kbd>Markdown</kbd> from the dropdown
2. Shortcut: <kbd>Esc</kbd>+<kbd>M</kbd>

### Create your first HTML file.
```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <p>Hello world!</p>
  </body>
</html>
```

_Now, copy and paste above lines into below cell and render it._

### Some Frequently Used Tags
- &lt;h1&gt; is to create a first-level heading.
    - HTML supports 6 levels of headings in total. You can use &lt;h1&gt;, &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;, &lt;h5&gt; and &lt;h6&gt; to create headings at different levels.
- &lt;div&gt; is to create a **division/section**.
- &lt;p&gt; is to create a **paragraph**.
- &lt;b&gt; is to create a line in **bold**.
- &lt;a&gt; is to create a **hyperlink** (a link to a website address).
- &lt;br&gt; is to insert a **line break**.
- &lt;font&gt; is to change the **font style**.
- &lt;input&gt; is to create an **input box**.
- &lt;button&gt; is to create a **button**.
- &lt;table&gt; is to create a **table**.
    - Child tags: &lt;th&gt; = table head; &lt;tr&gt; = table row; &lt;td&gt; = table data

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <h1>This is a heading.</h1>
    <h2>This is a small heading.</h2>
    <p>This is a paragraph.</p>
  </body>
</html>
```

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <h1>This is a heading.</h1>
    <h2>This is a small heading.</h2>
    <p>This is a long long long long paragraph with <br> line break.</p>
  </body>
</html>
```

### Style the text by redefining style attribute.
```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <p style="color: red">Hello world!</p>
  </body>
</html>
```

_Color is just one of many properties under style attribute. You can try changing other properties like **font-size**, **background-color** and **text-align**._

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <p style="color: red; font-size:10px;background-color:lightblue;text-align:center">Hello world!</p>
  </body>
</html>
```

### Add hyperlink by using &lt;a&gt; tag.
Key attribute is `href`, standing for hypertext reference.

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <p>Hello world!</p>
    <a href="https://juniorworld.github.io/python-workshop-2018/">Go to our Home Page</a>
  </body>
</html>
```

### Add Input Box.

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <a href="https://juniorworld.github.io/python-workshop-2018/">Go to our Home Page</a>
    <p>Please input your user name:</p>
    <input type="text">
  </body>
</html>
```

### Try other types of Input Box.
Alternative types: text, password, submit, reset, radio, checkbox, color, date and range.

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <a href="https://juniorworld.github.io/python-workshop-2018/">Go to our Home Page</a>
    <p>Please input your user name:</p>
    <input type="text">
    <p>Please input your password:</p>
    <input type="password">
    <br>
    <input type="submit">
    <input type="reset">
  </body>
</html>
```

To assign default value, you can use `value` attribute.

```html
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <a href="https://juniorworld.github.io/python-workshop-2018/">Go to our Home Page</a>
    <p>Please input your user name:</p>
    <input type="text" value="junior">
    <p>Please input your password:</p>
    <input type="password" value="123">
    <br>
    <input type="submit">
    <input type="reset">
  </body>
</html>
```

### Publish HTML page
Please save your HTML code as a file and rename it as "week5.html".
Double click the file to render the page locally.
If you have a server, you can send this file to your server and publish it as an online web page.

#### <font style="color: blue">Practice:</font>
<font style="color: blue">Please create a page like the one shown on the screen, save it as "week5_practice.html" and render it on your computer.</font>

---

## Break

---

## Web Crawling

We will use `selenium` package to collect data, which is applicable to both static and dynamic websites.<br>
Please download Chrome driver from this link: https://chromedriver.storage.googleapis.com/index.html?path=73.0.3683.20/

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [None]:
driver=webdriver.Chrome(executable_path='PATH OF CHROME EXE') #load the browser

In [None]:
#the browser needs a complete URL (scheme included), so turn the local file name into a file:// URI
from pathlib import Path
driver.get(Path('week5.html').resolve().as_uri()) #load your local html file

In [None]:
driver.title #print the title

## Locate Element by Xpath

We can locate elements by their relative/absolute paths in the file with additional hints about their tag name, attribute name, and attribute value.<br>
- Xpath is an expression of HTML element path
    - `/` is the sign of **absolute path**:
        - if used at the beginning: this is an xpath starting from the root node
        - if used in the middle: refer to the element **at the next level**
            - i.e. xpath of &lt;body&gt; can be written as "html/body" or "/html/body".
            - If you write "/body", the system will raise an error.
    - `//` is the sign of **relative path**: refer to any element that matches to the pattern no matter where they are.
        - i.e. xpath of &lt;body&gt; can be written as "//body"
    - `[@attribute name=attribute value]` we can include attribute into the matching pattern
        - i.e. "//input[@type='reset']"
        - The most efficient attribute is `id`. `id` is the unique identifier of an element.

In [None]:
#you can use find_element_by_xpath function to find the element by relative xpath
body=driver.find_element_by_xpath('//body')

In [None]:
body.text #get the text of the matched element

In [None]:
#or by absolute xpath
body=driver.find_element_by_xpath('/html/body')
print(body.text)

In [None]:
#use find_elements_by_xpath function to find a list of elements with shared pattern
inputs=driver.find_elements_by_xpath('//input')

In [None]:
len(inputs)

In [None]:
first_input=inputs[0]
print(first_input.get_attribute('value'))

In [None]:
ps=driver.find_elements_by_xpath('//p')

In [None]:
print(len(ps)) #count how many <p> are in the html
print(ps[0].text) #first element's text
print(ps[1].text) #second element's text

## Imitate Browsing Behavior

Some frequently used behaviors:
1. Click: `element.click()`
2. Type: `element.send_keys('something')`
3. Clear existing content: `element.clear()`
4. Scroll: 
    - Scroll to bottom: `driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")`
    - Scroll to specific location: i.e. scroll down by 400px, `driver.execute_script("window.scrollTo(0, 400);")`

In [None]:
#clean default name and fill in your name
name_box=inputs[0]
name_box.clear()
name_box.send_keys('your name')

In [None]:
#clean default password and fill in any random keys





In [None]:
#click the link of "GO to our Home Page"
link=driver.find_element_by_xpath('//a')
link.click()

In [None]:
#navigate to another online page and inspect the page
driver.get('https://juniorworld.github.io/python-workshop-2018/week5/1.html')

In [None]:
#copy the xpath and fill it into the bracket
Q1=driver.find_element_by_xpath('')
print(Q1.text)
Q2=driver.find_element_by_xpath('')
print(Q2.text)

In [None]:
#click the submit button
submit=driver.find_element_by_xpath('') #copy the xpath from inspect window will not look into attributes other than id
submit=driver.find_element_by_xpath('//input[@type="submit"]') #or you can specify xpath by yourself
submit.click()

#### <font style="color: blue">Practice:</font>
<font style="color: blue">Open Google page (https://www.google.com/), search for "JMSC" and click the "Google Search" button.</font>

In [None]:
#write your code here






In [None]:
#collect all results on the first page
results=driver.find_elements_by_xpath('//div[@class="rc"]')

In [None]:
#how many results are listed on the first page
len(results)

In [None]:
#print every result
for result in results:
    #an xpath without a leading slash is looked up under the current node
    anchor=result.find_element_by_xpath('div[@class="r"]/a')
    title=anchor.find_element_by_xpath('h3').text
    href=anchor.get_attribute('href')
    description=result.find_element_by_xpath('div[@class="s"]').text
    print(title,href,description)

In [None]:
#save results
#the with-statement closes the file even if one of the element lookups below raises
with open('week5_google.txt','w',encoding='utf-8') as output_file:
    for result in results:
        result_link=result.find_element_by_xpath('div[@class="r"]/a') #we can also find element under current node
        result_link_text=result_link.find_element_by_xpath('h3').text
        result_link_href=result_link.get_attribute('href')
        result_description=result.find_element_by_xpath('div[@class="s"]').text
        #one result per line: title, link and description separated by tabs
        output_file.write(result_link_text+'\t'+result_link_href+'\t'+result_description+'\n')