I came across this interesting data-set on London Data-store which “provides a 5% sample of all Oyster card journeys performed in a week during November 2009 on bus, Tube, DLR, TfL Rail and London Overground”.

The data is basically Origin to Destination (with a count). Here it is visualised for week-days:

TFL Passenger Visualisation (Week)

Week-end (it looks like the Jubilee and Victoria lines die and are replaced by the Piccadilly):

TFL Passenger Visualisation (Week-End)

The whole project can be decomposed into two parts:

  1. Guess the routes taken between the start and finish stations – I thought that querying a transport API would not be very interesting and thus decided to create a network to simulate the routes (the weights for the network were then calibrated based on TFL’s journey planner).
  2. Break apart the simulated routes into small chunks connecting individual stations, aggregate, and then visualise.

Furthermore, the data came from three different sources:

(i) Manually created the edges for the network:edges

(ii) The Oyster Card travel data mentioned above (here is what it looks like after cleaning):input

(iii) Geo-location data for 266 tube-stations (this was prepared for another project)

Method:

The idea was to create an (undirected) network –

metro_w <- graph.data.frame(weights, dir=FALSE)

Which ended up looking like this:out

Here it is zoomed in:zoom

There are 266 stations which define a point of entry into and exit from the network; these then link to a tube-line associated with the station, which link to other tube-lines. From the above example one can see that if I wanted to travel from Ealing Broadway to Perivale I would go through:

Ealing Broadway
Ealing Broadway*Central Line
West Acton*Central Line
North Acton*Central Line
Hanger Lane*Central Line
Perivale*Central Line
Perivale

The Station to Station*Line connections were given a high-weight so that they were never optimal and only picked when forced (to leave and exit the network). Stations on the same line ended up with weight of 1 and changing lines I assigned a weight of 3 (5 in the case of underground connected passages) since people dislike walking.

Since I had the network set-up I decided to run a PageRank algorithm to investigate a question I had for years: what is the (second – obviously first is Kings X) most ‘important’ station (I don’t have DLR or Overground or Rail stations in this data-set):pagerank

I was surprised that Bank was 9th. Below is a picture from Wikipedia – perhaps Bank is 9th (despite linking to so many stations) because the stations linking to Bank are not very popular?758px-pageranks-example-svg

The O-D stations were then fed into this simulator which would then spit out the route using igraph's shortest_paths command. Finally the routes were split into segments which were sorted so that the ‘from’ station was always alphabetically before the ‘to’ station and then collapsed. Here are the most popular segments:seg

R-Script (with annotations to explain in more detail):

library(igraph)
library(assertthat)
# NOTE(review): hard-coded setwd() with a placeholder path -- scripts are more
# portable if run from the project root instead; confirm before reuse
setwd('.../7. TFL Oyster OD/')

##########################################
# PART 1 - TFL JOURNEY PLANNER
##########################################

##########################################
# [0:] Read-In Data
# Read in non-directional data (i.e. A linked to B but not B to A)
# Expected columns (used below): "from", "to", "line"
##########################################
df.edges <- read.csv("network_data_non_directional.csv", as.is = TRUE, header = TRUE)

##########################################
# [A:] Create within station but different line
# For example: Changing from Ealing Broadway Central to Ealing Broadway Piccadilly
# Output: merged_lines (from, to, weight) -- one edge per pair of
# "Station*Line" nodes that share a station
##########################################
# Extract unique list of origin_lines
origin_lines <- unique(df.edges[c("from", "line")])
# Concatenate station and line into a "Station*Line" node name
origin_lines$stat_line <- paste(origin_lines$from, origin_lines$line, sep = "*")
origin_lines <- origin_lines[c("from", "stat_line")]
# Sort
origin_lines <- origin_lines[order(origin_lines$from), ]

# Extract unique list of destination_lines
to_lines <- unique(df.edges[c("to", "line")])
# Concatenate together
to_lines$stat_line <- paste(to_lines$to, to_lines$line, sep = "*")
to_lines <- to_lines[c("to", "stat_line")]
# Sort
to_lines <- to_lines[order(to_lines$to), ]

# Combine the two data-sets (edges are non-directional, so a station can
# appear on either side of the raw edge list)
origin_lines_2 <- to_lines
to_lines_2 <- origin_lines
names(origin_lines_2) <- c("from", "stat_line")
names(to_lines_2) <- c("to", "stat_line")

origin_lines <- rbind(origin_lines, origin_lines_2)
rm(origin_lines_2)

to_lines <- rbind(to_lines, to_lines_2)
rm(to_lines_2)

# BOTH EQUAL
# CROSS with each other: pair every line-node of a station with every other
merged_lines <- merge(origin_lines, to_lines, by.x = 'from', by.y = 'to', all = TRUE)
merged_lines <- merged_lines[c("stat_line.x", "stat_line.y")]

# Remove obvious duplicates (a node paired with itself)
merged_lines <- merged_lines[merged_lines$stat_line.x != merged_lines$stat_line.y, ]
# BUG FIX: noNA() only *returns* TRUE/FALSE; the original discarded that
# result, so the check never asserted. Wrap in assert_that() so a failure
# actually stops the script.
assertthat::assert_that(assertthat::noNA(merged_lines))

# Remove directional duplicates
# Order so that from is alphabetically before to
alpha_sort <- (merged_lines$stat_line.y < merged_lines$stat_line.x)

merged_lines$f <- merged_lines$stat_line.x
merged_lines$f[alpha_sort] <- merged_lines$stat_line.y[alpha_sort]

merged_lines$t <- merged_lines$stat_line.x
merged_lines$t[!alpha_sort] <- merged_lines$stat_line.y[!alpha_sort]

merged_lines <- unique(merged_lines[c("f", "t")])
# BUG FIX: are_equal() also just returns a logical that was thrown away --
# assert explicitly that the alphabetical ordering invariant holds
assertthat::assert_that(all(merged_lines$f <= merged_lines$t))

names(merged_lines) <- c("from", "to")
# Dislike changing lines
# Trial-and-error optimised parameter
merged_lines$weight <- 3

##########################################
# [B:] Create across stations same line
# Adjacent stations on the same line become
# "Station*Line" -> "Station*Line" edges
##########################################
stations_across <- data.frame(
  from = paste(df.edges$from, df.edges$line, sep = "*"),
  to = paste(df.edges$to, df.edges$line, sep = "*"),
  # Connections on the same line have a minimum weight of 1
  # Ideally = number of seconds of journey time or metre length of route
  weight = 1,
  stringsAsFactors = FALSE
)

##########################################
# [C:] Append both edge sets into one weight table
##########################################
weights <- rbind(merged_lines, stations_across)
rm(merged_lines, stations_across)

# Underground connecting passages are penalised even more heavily than a
# normal line change (5 vs 3) -- people really dislike them!
has_passage <- grepl("Connected_Underground", weights$from) |
  grepl("Connected_Underground", weights$to)
weights$weight[has_passage] <- 5

##########################################
# [D:] Add on original station names with high weight
# This provides a way to enter the network
# The network will then decide which line to take
##########################################
# Rename origin_lines' first column so both tables share the ("to",
# "stat_line") layout, stack, and de-duplicate
entry_edges <- setNames(origin_lines, c("to", "stat_line"))
station_line <- unique(rbind(entry_edges, to_lines))
names(station_line) <- c("from", "to")
# Very high weight so these edges are never chosen mid-route
# (only used to enter and exit the network)
station_line$weight <- 1000
rm(entry_edges, origin_lines, to_lines)


##########################################
# [E:] Append to create final
##########################################
weights <- weights[c("from", "to", "weight")]
weights <- rbind(weights, station_line)
rm(station_line)

##############################################
# [F:] Create network object
# Undirected graph; the third column "weight" becomes the edge weight
# attribute by igraph convention.
##############################################
# FIX: graph.data.frame(dir=FALSE) used the deprecated igraph name plus
# partial argument matching ("dir" for "directed"). Use the modern spelling,
# consistent with shortest_paths()/V() used later in this script.
metro_w <- graph_from_data_frame(weights, directed = FALSE)

rm(alpha_sort, df.edges, weights)

# [A] Test out PageRank of the network since we have the full network
library(dplyr)
library(printr)
# FIX: page.rank() is the deprecated igraph spelling; page_rank() is the
# modern equivalent (consistent with shortest_paths() used below)
pr <- page_rank(metro_w)$vector
df <- data.frame(PageRank = pr)
df$row <- row.names(df)
df <- arrange(df, desc(PageRank))
# FIX: df[0:10, ] silently relied on the 0 index being dropped;
# head() states the intent (top 10) directly
print(head(df, 10))
# Kings Cross, Baker Street, Waterloo, Paddington, Liv Pool,
# Embankment, Moorgate, Edgware Road, Bank, Notting Hill Gate

# [B] Plot network
# Renders the full station/line graph to a large PNG for visual inspection
png(filename="org_network.png", height=4000, width=4000) 
par(mai=c(0,0,1,0))  # Size of margins
plot.igraph(metro_w,
            main = "London TFL Network",
            vertex.label=V(metro_w)$name,
            vertex.label.color="darkblue",
            vertex.label.font=2,
            vertex.label.cex=1,
            vertex.size=1.5,          # small nodes so labels dominate
            vertex.color="lightblue",
            edge.curved=TRUE,
            edge.color="grey")
dev.off()

# [C] Save
# NOTE(review): write.graph() without format= falls back to its default
# edge-list output, which stores numeric vertex ids only -- vertex names and
# edge weights are not preserved. Consider format="graphml" if this file is
# meant to be reloadable -- TODO confirm intended use.
write.graph(metro_w, file="tfl_network")

##############################################
# [G:] Read in INPUT CSV
##############################################

# Cleaned origin-destination journey counts: From | To | Count
# Split by week and week-end (swap the commented line for the weekend file)
#input_csv <- read.csv('TfL collapsed weekend clean.csv', header = TRUE, as.is = TRUE)
input_csv <- read.csv('TfL collapsed week clean.csv', header = TRUE, as.is = TRUE)

# Standardise the column names and keep them in (from, to, count) order
input_csv <- setNames(input_csv, c("from", "to", "count"))[c("from", "to", "count")]

##############################################
# [H:] Get results in a loop
# For every OD pair, simulate the route through the network and break it
# into consecutive (from, to) segments carrying the journey's count.
##############################################

# Remove nonsense journeys (origin == destination)
input_csv <- input_csv[input_csv$from != input_csv$to,]
# Collect one data.frame per journey in a list and append all at once
comb <- list()

# FIX: seq_len() is safe when nrow() is 0 (1:0 would iterate over c(1, 0))
for (j in seq_len(nrow(input_csv))) {
  from_csv <- input_csv$from[j]
  to_csv <- input_csv$to[j]

  print(paste("Running no(", j, ") stations: ", from_csv, " to: ", to_csv, sep=""))
  # Cheapest route through the weighted network, as a vertex-name sequence:
  # Origin, Origin*Line, ..., Destination*Line, Destination
  route <- shortest_paths(metro_w, from_csv, to_csv)$vpath[[1]]
  nodes <- V(metro_w)$name[route]
  print(nodes)

  # Journey id is "Origin/Destination", taken from the untrimmed path
  journey_id <- paste(nodes[1], nodes[length(nodes)], sep="/")
  # Drop the pure-station entry/exit vertices; keep only Station*Line hops
  inner <- nodes[2:(length(nodes) - 1)]

  # Consecutive pairs form the segments: A->B, B->C, C->D, ...
  # The 'count' is the same within the journey
  # (of course if passengers jumped out of a moving train this wouldn't hold)
  # FIX: plain character columns (stringsAsFactors = FALSE) -- the original
  # factor columns were fragile when row-binding across journeys
  comb[[j]] <- data.frame(from = inner[-length(inner)],
                          to = inner[-1],
                          id = journey_id,
                          count = input_csv$count[j],
                          stringsAsFactors = FALSE)
}

# Use do.call() instead of a loop when appending R data-frames
# Faster than multiple rbinds; which copy the data over and over again
final.data <- do.call(rbind, comb)

# Optionally persist the simulated route segments:
#write.csv(final.data, 'routes_cs_out.csv', row.names = FALSE)

##########################################
# PART 2 - Map
##########################################

# Either reload previously saved segments or reuse the in-memory result
#input_csv <- read.csv('routes_cs_out.csv', as.is = TRUE)
input_csv <- final.data[c("from", "to", "count")]

# Strip the "*Line" suffix so segments collapse to plain station names
input_csv$from <- sub("\\*(.*)", "", input_csv$from)
input_csv$to <- sub("\\*(.*)", "", input_csv$to)

# Drop self-loops created by removing the line information
# (there should not be any!)
input_csv <- input_csv[input_csv$from != input_csv$to, ]

##############################################
# [B:] Prepare data for plot
##############################################
# Coordinates for the tube-stations. Stations with more than one line have
# had their overlapping points spread out into a circle for readability and
# are therefore plotted slightly differently from single-line stations.
unq_stations <- read.csv("readin_unique_tube_stations.csv")
many_stations <- read.csv("readin_many_tube_stations.csv")

# Prepare master file (combine the two geo sources, de-duplicated)
geo_cols <- c("Station", "Longitude", "Latitude")
stations_for_merge <- unique(rbind(unq_stations[geo_cols],
                                   many_stations[geo_cols]))

##############################################
# Corrections for consistency
# Since we had three different data-sources:
# 1) Defining the tube network
# 2) Passenger count data
# 3) Geocoding data for tube-stations
# we could at worst have three different spellings for each station; after a
# significant amount of cleaning only these inconsistencies remained
stations_for_merge$Station <- sub("London ", "", stations_for_merge$Station)
# Stripping "London " mangles "London Bridge" -- restore it
stations_for_merge$Station[stations_for_merge$Station == "Bridge"] <- "London Bridge"
stations_for_merge <- stations_for_merge[order(stations_for_merge$Station), ]

##############################################
# Attach coordinates to each segment endpoint
##############################################

# Merge (origin): add origin longitude/latitude
input_csv_orig <- merge(input_csv, stations_for_merge, by.x='from', by.y='Station', all.x = TRUE, all.y = FALSE)
# Show any stations that failed to geocode (explicit print -- a bare
# top-level expression only auto-prints interactively)
print(input_csv_orig[!complete.cases(input_csv_orig),])
# BUG FIX: noNA() merely *returns* TRUE/FALSE and the original discarded the
# result; wrap in assert_that() so an unmatched station halts the script
assertthat::assert_that(assertthat::noNA(input_csv_orig))
names(input_csv_orig) <- c("from", "to", "count", "orig.lon", "orig.lat")

# Merge (destination): add destination longitude/latitude
input_csv_dest <- merge(input_csv_orig, stations_for_merge, by.x='to', by.y='Station', all.x = TRUE, all.y = FALSE)
print(input_csv_dest[!complete.cases(input_csv_dest),])
assertthat::assert_that(assertthat::noNA(input_csv_dest))
names(input_csv_dest) <- c("from", "to", "count", "orig.lon", "orig.lat", "dest.lon", "dest.lat")

final_segments <- input_csv_dest
rm(input_csv_orig, input_csv_dest)

# Collapse (and arrange so that from is alphabetically before to)
library(plyr)

# TRUE where the endpoints are stored in reverse alphabetical order and
# therefore need swapping (together with their coordinates)
swap <- final_segments$from > final_segments$to

# Canonical origin = alphabetically-first station of the pair
final_segments$from2     <- ifelse(swap, final_segments$to,       final_segments$from)
final_segments$orig.lon2 <- ifelse(swap, final_segments$dest.lon, final_segments$orig.lon)
final_segments$orig.lat2 <- ifelse(swap, final_segments$dest.lat, final_segments$orig.lat)

# Canonical destination = alphabetically-second station of the pair
final_segments$to2       <- ifelse(swap, final_segments$from,     final_segments$to)
final_segments$dest.lon2 <- ifelse(swap, final_segments$orig.lon, final_segments$dest.lon)
final_segments$dest.lat2 <- ifelse(swap, final_segments$orig.lat, final_segments$dest.lat)

# Keep only the canonicalised columns, under the original names
final_segments <- final_segments[c("from2", "to2", "orig.lon2", "orig.lat2", "dest.lon2", "dest.lat2", "count")]
names(final_segments) <- c("from", "to", "orig.lon", "orig.lat", "dest.lon", "dest.lat", "count")

# Sum passenger counts per (now undirected) segment
final_segments <- ddply(final_segments, c("from", "to", "orig.lon", "orig.lat", "dest.lon", "dest.lat"), summarize, count_col=sum(count))

##############################################
# [C:] PLot
##############################################

library(rgeos)
library(rgdal)
library(proto)
library(ggplot2)
library('devtools')
# NOTE(review): dev_mode() switches to devtools' development library path --
# presumably to load a specific/patched ggplot2 build that the proto-based
# GeomSegment2 below requires; confirm this is still needed
dev_mode()

#################################################################################
# Create GeomSegment2 function
# "GeomSegment2" Not written by me - I think spatial.ly?
# The function helps remove 'gaps' between segments compared to the original
# NOTE(review): this clones ggplot2's internal GeomSegment via proto() and the
# ggplot2::: triple-colon -- it depends on pre-2.0 ggplot2 internals (proto,
# $new()) and will not work with modern ggproto-based ggplot2; confirm the
# package version in use (see dev_mode() above).
#################################################################################
GeomSegment2 <- proto(ggplot2:::GeomSegment, {
  objname <- "geom_segment2"
  # Identical to the stock draw method except lineend = "round", which makes
  # consecutive segments join without visible gaps
  draw <- function(., data, scales, coordinates, arrow=NULL, ...) {
    if (is.linear(coordinates)) {
      return(with(coord_transform(coordinates, data, scales),
                  segmentsGrob(x, y, xend, yend, default.units = "native",
                               gp = gpar(col = alpha(colour, alpha), 
                                         lwd = size * .pt,
                                         lty = linetype, lineend = "round"),
                               arrow = arrow)))
    }
  }
})

# User-facing wrapper mirroring geom_segment()'s signature; forwards all
# arguments to the proto object's constructor
geom_segment2 <- function(mapping = NULL, data = NULL, stat = "identity", 
                          position = "identity", arrow = NULL, ...) {
  GeomSegment2$new(mapping = mapping, data = data, stat = stat, 
                   position = position, arrow = arrow, ...)
}
#################################################################################

# Remove both axes (no titles, no breaks) via a reusable list of scales
quiet <- list(
  scale_x_continuous("", breaks = NULL),
  scale_y_continuous("", breaks = NULL)
)

# ADD Water (clipped from OS), reprojected to WGS84 lon/lat
rivers_loc <- '.../os_vector_map_merged(IK)'
rivers <- readOGR(rivers_loc, layer="TQ_TidalWater_Merge_Clip")
rivers <- spTransform(rivers, CRS("+proj=longlat +datum=WGS84"))
plot(rivers)  # quick visual sanity-check of the clipped water layer

# # ADD Buildings (Clipped from OS)
# buildings_loc <- 'os_buildings_london'
# buildings <- readOGR(buildings_loc, layer="TQTL_LON") 
# buildings <- spTransform(buildings, CRS("+proj=longlat +datum=WGS84"))

# Plot 
# Layer order matters: water first, then passenger-flow segments, then
# station points on top.
ggplot() +
  
#   Buildings as background
#   geom_polygon(data=buildings, aes(long, lat, group=group),
#                color=NA, fill='#0C090A', size=1, alpha=0.3)+
  
  # Plot rivers
  geom_polygon(data=rivers, aes(long, lat, group=group),
               color=NA, fill='#000033', size=1, alpha=0.3)+
  
  # Plot segments: both width and opacity encode passenger volume
  geom_segment2(data=final_segments, aes(x=orig.lon, xend = dest.lon, y=orig.lat, yend=dest.lat, size=count_col, alpha=count_col), 
                color='white') +
  
  # Plot unique (single-line stations)
  geom_point(data=unq_stations, aes(x=Longitude, y=Latitude, colour=Line),
             alpha=1, size=2)  +
  
  # Plot multiple (smaller size; uses the spread-out circle coordinates)
  geom_point(data=many_stations, aes(x=new_lng, y=new_lat, colour=Line),
             alpha=1, size=1)  +
  
  
  scale_size(range=c(0.02, 1.2)) +
  
  scale_alpha_continuous(range = c(0.2, 0.95)) +
  
  # Official TfL line colours
  scale_color_manual(values=c(
    "District"="#00782A",
    "Piccadilly"="#003688",
    "Hammersmith & City"="#F3A9BB",
    "Circle"="#FFD300",
    "Metropolitan"="#9B0056",
    "Northern"="#000000",
    "Bakerloo"="#B36305",
    "Jubilee"="#A0A5A9",
    "Central"="#E32017",
    "Waterloo & City"="#95CDBA",
    "Victoria"="#0098D4")) +
  
  theme(panel.background = element_rect(fill='#2C3539',colour='#2C3539')) +
  
  # Aspect ratio approximates metres-per-degree at London's latitude
  #http://geepeeex.com/LongitudesAndLatitudes.htm
  quiet + coord_equal(ratio=122/78)

# Save
ggsave("tube_journies.png", width=20, height=10, dpi=400)