For Github version follow link
This report analyzes the San Francisco Bay Area bike share network using three datasets: a station information table and one week of trip records from July 2014 and from July 2015.
The goals are to evaluate the evolution of the network between 2014 and 2015 and identify critical stations or routes.
# Read the station table and the two trip extracts (July 2014 and July 2015,
# per the file names) directly from the project's GitHub repository.
bikeshare_urls <- c(
  stations = "https://raw.githubusercontent.com/thuankhang/casestudy-sf-bikeshare/refs/heads/main/data/SF-bikeshare-station-info.csv",
  trips_2014 = "https://raw.githubusercontent.com/thuankhang/casestudy-sf-bikeshare/refs/heads/main/data/SF-bikeshare-1-week-2014-07.csv",
  trips_2015 = "https://raw.githubusercontent.com/thuankhang/casestudy-sf-bikeshare/refs/heads/main/data/SF-bikeshare-1-week-2015-07.csv"
)
bikeshare_data <- lapply(bikeshare_urls, read.csv)
stations <- bikeshare_data[["stations"]]
trips_2014 <- bikeshare_data[["trips_2014"]]
trips_2015 <- bikeshare_data[["trips_2015"]]
We have 3 datasets:
`stations`:
str(stations)
## 'data.frame': 70 obs. of 7 variables:
## $ id : int 2 3 4 5 6 7 8 9 10 11 ...
## $ name : chr "San Jose Diridon Caltrain Station" "San Jose Civic Center" "Santa Clara at Almaden" "Adobe on Almaden" ...
## $ lat : num 37.3 37.3 37.3 37.3 37.3 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ dock_count : int 27 15 11 19 15 15 15 15 15 19 ...
## $ city : chr "San Jose" "San Jose" "San Jose" "San Jose" ...
## $ installation_date: chr "8/6/2013" "8/5/2013" "8/6/2013" "8/5/2013" ...
`trips_2014`:
str(trips_2014)
## 'data.frame': 6911 obs. of 7 variables:
## $ start_date_yyyymmdd: chr "2014-07-13" "2014-07-13" "2014-07-13" "2014-07-13" ...
## $ start_station_name : chr "Powell at Post (Union Square)" "Market at 4th" "Market at 4th" "Grant Avenue at Columbus Avenue" ...
## $ start_station_id : int 71 76 76 73 50 2 61 75 28 71 ...
## $ end_date_yyyymmdd : chr "2014-07-13" "2014-07-13" "2014-07-13" "2014-07-13" ...
## $ end_station_name : chr "Embarcadero at Bryant" "Market at 10th" "Market at 10th" "Powell at Post (Union Square)" ...
## $ end_station_id : int 54 67 67 71 63 4 54 57 32 39 ...
## $ duration : int 667 401 401 470 421 221 233 455 559 1386 ...
`trips_2015`:
str(trips_2015)
## 'data.frame': 7381 obs. of 7 variables:
## $ start_date_yyyymmdd: chr "2015-07-12" "2015-07-12" "2015-07-12" "2015-07-12" ...
## $ start_station_name : chr "Howard at 2nd" "Temporary Transbay Terminal (Howard at Beale)" "San Jose City Hall" "Clay at Battery" ...
## $ start_station_id : int 63 55 10 41 77 42 16 16 76 50 ...
## $ end_date_yyyymmdd : chr "2015-07-12" "2015-07-12" "2015-07-12" "2015-07-12" ...
## $ end_station_name : chr "Market at Sansome" "Powell Street BART" "SJSU - San Salvador at 9th" "Washington at Kearny" ...
## $ end_station_id : int 77 39 16 46 73 49 10 16 69 56 ...
## $ duration : int 121 444 444 166 624 363 570 82 402 5868 ...
Let’s get the summary of `trips_2014` and `trips_2015`:
summary(trips_2014)
## start_date_yyyymmdd start_station_name start_station_id end_date_yyyymmdd
## Length:6911 Length:6911 Min. : 2.00 Length:6911
## Class :character Class :character 1st Qu.:50.00 Class :character
## Mode :character Mode :character Median :62.00 Mode :character
## Mean :57.63
## 3rd Qu.:70.00
## Max. :84.00
## end_station_name end_station_id duration
## Length:6911 Min. : 2.00 Min. : 61
## Class :character 1st Qu.:49.00 1st Qu.: 348
## Mode :character Median :61.00 Median : 520
## Mean :57.23 Mean : 1158
## 3rd Qu.:70.00 3rd Qu.: 753
## Max. :84.00 Max. :715339
summary(trips_2015)
## start_date_yyyymmdd start_station_name start_station_id end_date_yyyymmdd
## Length:7381 Length:7381 Min. : 2.00 Length:7381
## Class :character Class :character 1st Qu.:50.00 Class :character
## Mode :character Mode :character Median :61.00 Mode :character
## Mean :57.65
## 3rd Qu.:70.00
## Max. :84.00
## end_station_name end_station_id duration
## Length:7381 Min. : 2.00 Min. : 61
## Class :character 1st Qu.:50.00 1st Qu.: 356
## Mode :character Median :61.00 Median : 531
## Mean :57.69 Mean : 1198
## 3rd Qu.:70.00 3rd Qu.: 768
## Max. :84.00 Max. :1133540
sum(is.na(stations))
## [1] 0
sum(is.na(trips_2014))
## [1] 0
sum(is.na(trips_2015))
## [1] 0
There are no missing (NA) values in any of the datasets.
Standardize the datasets for consistent analysis.
Add columns for start city, end city, and trip between cities.
# Tag every trip with the city of its start and end station (looked up in
# `stations` by station id) plus a "start -> end" city-pair label.
add_city_columns <- function(trips) {
  start_row <- match(trips$start_station_id, stations$id)
  end_row <- match(trips$end_station_id, stations$id)
  trips$start_city <- stations$city[start_row]
  trips$end_city <- stations$city[end_row]
  trips$trip_between_cities <- paste(trips$start_city, "->", trips$end_city)
  trips
}
trips_2014 <- add_city_columns(trips_2014)
trips_2015 <- add_city_columns(trips_2015)
Add latitude and longitude information for each station.
# Copy the latitude/longitude of the start and end station onto every trip.
# Station ids with no entry in `stations` yield NA coordinates.
attach_station_coords <- function(trips) {
  from_row <- match(trips$start_station_id, stations$id)
  to_row <- match(trips$end_station_id, stations$id)
  trips$start_station_lat <- stations$lat[from_row]
  trips$start_station_long <- stations$long[from_row]
  trips$end_station_lat <- stations$lat[to_row]
  trips$end_station_long <- stations$long[to_row]
  trips
}
trips_2014 <- attach_station_coords(trips_2014)
trips_2015 <- attach_station_coords(trips_2015)
We will find number of unique stations in July each year.
# Distinct station ids that occur as a trip start or end in each year.
stations_2014 <- with(trips_2014, union(start_station_id, end_station_id))
stations_2015 <- with(trips_2015, union(start_station_id, end_station_id))
There are more stations in July 2015, so we will find the stations that were added and the stations that were removed (if any) over the year:
# Station ids seen only in 2015 (added) and only in 2014 (removed).
new_stations <- unique(stations_2015[!(stations_2015 %in% stations_2014)])
removed_stations <- unique(stations_2014[!(stations_2014 %in% stations_2015)])
First, we need to find out how many cities there are, and which ones, for later use.
# Cities that have at least one station used as a trip start or end in each year.
ids_seen_2014 <- union(trips_2014$start_station_id, trips_2014$end_station_id)
ids_seen_2015 <- union(trips_2015$start_station_id, trips_2015$end_station_id)
city_2014 <- unique(stations$city[stations$id %in% ids_seen_2014])
city_2015 <- unique(stations$city[stations$id %in% ids_seen_2015])
There are in total 5 cities in dataset of July 2014 and 5 cities in dataset of July 2015
# Number of stations used in July 2014, tabulated by city.
stations_2014_count <- table(with(stations, city[id %in% stations_2014]))
stations_2014_count
##
## Mountain View Palo Alto Redwood City San Francisco San Jose
## 7 5 6 35 16
# Save the 2014 per-city station counts to a CSV file; the relative path means
# it is written to the current working directory.
write.csv(stations_2014_count, "num_stations_city_2014.csv")
# Number of stations used in July 2015, tabulated by city.
stations_2015_count <- table(with(stations, city[id %in% stations_2015]))
stations_2015_count
##
## Mountain View Palo Alto Redwood City San Francisco San Jose
## 7 5 7 35 16
Insights
- As we found before, there is one more station in 2015: the new Redwood City Medical Center station, located in Redwood City.
- The number of stations in every other city is unchanged.
- San Francisco has the most stations, while Palo Alto has the fewest. We can therefore predict that connectivity and trip volume in San Francisco will outweigh those of the other cities.
# Draw one pie chart of stations per city for each year, using one rainbow()
# colour per city.
draw_city_pie <- function(counts, title) {
  pie(counts, main = title, col = rainbow(length(counts)), radius = 1)
}
draw_city_pie(stations_2014_count, "Pie Chart of Stations in Cities - July 2014")
draw_city_pie(stations_2015_count, "Pie Chart of Stations in Cities - July 2015")
# Long-format table of station counts: one row per (city, year).
# City labels are taken from each year's own table so that counts stay aligned
# with their city even if the two years do not cover the same set of cities.
stations_count <- data.frame(
  City = c(names(stations_2014_count), names(stations_2015_count)),
  Year = rep(
    c(2014, 2015),
    times = c(length(stations_2014_count), length(stations_2015_count))
  ),
  Count = c(as.vector(stations_2014_count), as.vector(stations_2015_count))
)

# Side-by-side bars per city, coloured by year.
# theme_minimal() is a complete theme and replaces every theme element, so it
# must come BEFORE the axis-text tweak; otherwise the 45-degree rotation is lost.
ggplot(stations_count, aes(x = City, y = Count, fill = factor(Year))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "City", y = "Number of Stations", fill = "Year") +
  ggtitle("Number of Stations by City in July 2014 and 2015") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Since the difference between the 2014 and 2015 datasets is small, I will plot the graph only for 2014.
First, get dataframe of station start, end and vol.
# Cross-tabulate start and end station ids into a long table with one row per
# (Start, End) pair and the number of trips in Vol.
stations_graphing_2014 <- as.data.frame(
  table(
    Start = trips_2014$start_station_id,
    End = trips_2014$end_station_id
  ),
  responseName = "Vol"
)
Then, make graph using graph_from_data_frame
# Build a directed station graph with one edge per route that actually had
# trips. The cross-tabulation also contains a row for every start/end pair with
# zero trips; turning those into edges would connect every station to every
# other one and give all vertices the same out-degree (and hence the same size).
active_routes_2014 <- stations_graphing_2014[stations_graphing_2014$Vol > 0, ]
graph_2014 <- graph_from_data_frame(active_routes_2014)

# Vertex names are station ids stored as text; look their coordinates up in one
# vectorized match. Ids missing from `stations` get NA coordinates.
station_row_2014 <- match(V(graph_2014)$name, as.character(stations$id))
long_2014 <- stations$long[station_row_2014]
lat_2014 <- stations$lat[station_row_2014]

# Place each vertex at its longitude/latitude and size it by the number of
# distinct destinations reached from that station.
V(graph_2014)$x <- long_2014
V(graph_2014)$y <- lat_2014
V(graph_2014)$size <- degree(graph_2014, mode = "out") * 0.1
And plotting:
# Fetch GADM level-2 administrative boundaries for the USA via geodata, using
# the current directory as the download/cache path.
USA <- geodata::gadm(country='USA', level=2, path = ".")
# Keep only the polygons whose NAME_1 attribute is "California".
cities_map <- USA[USA$NAME_1 %in% "California",]
# Draw the base map, limited to a longitude/latitude window.
# NOTE(review): the window presumably spans the five cities in `stations` — confirm.
plot(cities_map,
xlim = c(-122.45, -121.875),
ylim = c(37.325, 37.825))
# Overlay the station graph on the existing map. rescale = FALSE is meant to
# keep the vertex x/y (longitude/latitude) as plot coordinates instead of
# normalising them; the arrow settings keep the edge arrowheads very small.
plot(graph_2014, add = TRUE, rescale = FALSE,
edge.arrow.size = 0.01, edge.arrow.width = 0.75)
The full network is too difficult to read, so my first plan was to look at each city separately. However, I could not plot the other cities (I don’t know why), so I will plot only San Francisco.
The code is the same as above, so I will skip explaining it.
# Keep only trips that both start and end in San Francisco. The mask is built
# once and uses %in% so that trips with an unknown (NA) city are simply dropped.
sf_only_2014 <- trips_2014$start_city %in% "San Francisco" &
  trips_2014$end_city %in% "San Francisco"
sf_trips_2014 <- trips_2014[sf_only_2014, ]

# Long table with one row per (Start, End) station pair and its trip count.
stations_graphing_2014 <- as.data.frame(
  table(sf_trips_2014$start_station_id, sf_trips_2014$end_station_id)
)
colnames(stations_graphing_2014) <- c("Start", "End", "Vol")

# Only routes with at least one trip become edges. The cross-tabulation also
# lists every pair with zero trips; keeping those would make the graph complete
# and give every station the same out-degree (and the same vertex size).
graph_2014 <- graph_from_data_frame(
  stations_graphing_2014[stations_graphing_2014$Vol > 0, ]
)

# Vertex names are station ids stored as text; look their coordinates up in one
# vectorized match. Ids missing from `stations` get NA coordinates.
station_row_2014 <- match(V(graph_2014)$name, as.character(stations$id))
long_2014 <- stations$long[station_row_2014]
lat_2014 <- stations$lat[station_row_2014]

# Place each vertex at its longitude/latitude and size it by the number of
# distinct destinations reached from that station.
V(graph_2014)$x <- long_2014
V(graph_2014)$y <- lat_2014
V(graph_2014)$size <- degree(graph_2014, mode = "out") * 0.01
# Fetch GADM level-2 administrative boundaries for the USA via geodata, using
# the current directory as the download/cache path.
USA <- geodata::gadm(country='USA', level=2, path = ".")
# Select the polygon(s) whose NAME_2 attribute is "San Francisco".
# NOTE(review): NAME_2 presumably holds level-2 (county) names, so a city name
# that is not also a county name would select nothing — possibly why the other
# cities could not be plotted this way; confirm against the GADM attributes.
map <- USA[USA$NAME_2 == "San Francisco",]
# Draw the base map, limited to a small longitude/latitude window.
plot(map,
xlim = c(-122.425, -122.375),
ylim = c(37.77, 37.81),
main = "Graph in San Francisco")
# Overlay the station graph on the existing map. rescale = FALSE is meant to
# keep the vertex x/y (longitude/latitude) as plot coordinates instead of
# normalising them.
plot(graph_2014, add = TRUE, rescale = FALSE,
edge.arrow.size = 0.01, edge.arrow.width = 0.75)
Insights
- Total Trips: Increased from 6,911 in 2014 to 7,381 in 2015, reflecting greater bike-share usage.
- Total Trip Durations: Grew from 2,223.13 hours in 2014 to 2,457.26 hours in 2015, suggesting longer or more frequent trips.
- Average Trip Duration: Increased slightly from 1,158.05 seconds in 2014 to 1,198.5 seconds in 2015, indicating a trend toward longer rides.
# Per city pair in 2014: number of trips, total duration and mean duration.
table_trips_2014 <- setNames(
  as.data.frame(table(trips_2014$trip_between_cities)),
  c("trip_between_cities_2014", "num_trips")
)

# Total duration per city pair, aligned to the row order of the count table.
duration_by_pair_2014 <- tapply(
  trips_2014$duration,
  trips_2014$trip_between_cities,
  sum
)
pair_labels_2014 <- as.character(table_trips_2014$trip_between_cities_2014)
duration_2014 <- as.vector(duration_by_pair_2014[pair_labels_2014])

table_trips_2014$duration <- duration_2014
table_trips_2014$avg_duration <- with(
  table_trips_2014,
  round(duration / num_trips, 2)
)
table_trips_2014
## trip_between_cities_2014 num_trips duration avg_duration
## 1 Mountain View -> Mountain View 260 518743 1995.17
## 2 Mountain View -> Palo Alto 6 11596 1932.67
## 3 Palo Alto -> Mountain View 6 8959 1493.17
## 4 Palo Alto -> Palo Alto 86 144850 1684.30
## 5 Redwood City -> Palo Alto 2 6829 3414.50
## 6 Redwood City -> Redwood City 27 15796 585.04
## 7 San Francisco -> San Francisco 6071 5867541 966.49
## 8 San Jose -> San Jose 453 1428963 3154.44
# Label every row of the 2014 summary with its year.
table_trips_2014[["year"]] <- rep(2014, nrow(table_trips_2014))
table_trips_2014
## trip_between_cities_2014 num_trips duration avg_duration year
## 1 Mountain View -> Mountain View 260 518743 1995.17 2014
## 2 Mountain View -> Palo Alto 6 11596 1932.67 2014
## 3 Palo Alto -> Mountain View 6 8959 1493.17 2014
## 4 Palo Alto -> Palo Alto 86 144850 1684.30 2014
## 5 Redwood City -> Palo Alto 2 6829 3414.50 2014
## 6 Redwood City -> Redwood City 27 15796 585.04 2014
## 7 San Francisco -> San Francisco 6071 5867541 966.49 2014
## 8 San Jose -> San Jose 453 1428963 3154.44 2014
# Per city pair in 2015: number of trips, total duration and mean duration.
table_trips_2015 <- setNames(
  as.data.frame(table(trips_2015$trip_between_cities)),
  c("trip_between_cities_2015", "num_trips")
)

# Total duration per city pair, aligned to the row order of the count table.
duration_by_pair_2015 <- tapply(
  trips_2015$duration,
  trips_2015$trip_between_cities,
  sum
)
pair_labels_2015 <- as.character(table_trips_2015$trip_between_cities_2015)
duration_2015 <- as.vector(duration_by_pair_2015[pair_labels_2015])

table_trips_2015$duration <- duration_2015
table_trips_2015$avg_duration <- with(
  table_trips_2015,
  round(duration / num_trips, 2)
)
table_trips_2015
## trip_between_cities_2015 num_trips duration avg_duration
## 1 Mountain View -> Mountain View 230 329696 1433.46
## 2 Mountain View -> Palo Alto 1 1917 1917.00
## 3 Palo Alto -> Mountain View 7 52995 7570.71
## 4 Palo Alto -> Palo Alto 81 1282512 15833.48
## 5 Redwood City -> Redwood City 47 139199 2961.68
## 6 San Francisco -> San Francisco 6628 6684147 1008.47
## 7 San Jose -> San Jose 387 355687 919.09
# Label every row of the 2015 summary with its year.
table_trips_2015$year <- rep(2015, nrow(table_trips_2015))

# Stack the 2014 and 2015 city-pair summaries into one table.
# The city-pair columns are factors with different level sets; they are turned
# into character before concatenation because c() on factors returns the
# underlying integer codes on R versions before 4.1, which would replace the
# labels with meaningless numbers.
table_trips <- data.frame(
  trip_between_cities = c(
    as.character(table_trips_2014$trip_between_cities_2014),
    as.character(table_trips_2015$trip_between_cities_2015)
  ),
  num_trips = c(table_trips_2014$num_trips, table_trips_2015$num_trips),
  duration = c(table_trips_2014$duration, table_trips_2015$duration),
  avg_duration = c(table_trips_2014$avg_duration, table_trips_2015$avg_duration),
  year = c(table_trips_2014$year, table_trips_2015$year)
)

# Save the combined summary to the current working directory.
write.csv(table_trips, "trips_summary_between_city_both.csv")
Insights
1. 2014
- San Francisco had the highest number of trips (6,071) and significant total duration (5,867,541 seconds, avg. 966.49 seconds per trip). It was the busiest and most central hub.
- San Jose had notable trips (453) with a high average duration (3,154.44 seconds), reflecting longer commutes.
- Outliers like Redwood City -> Palo Alto had very high average durations (3,414.50 seconds) despite fewer trips (2 trips), likely due to recreational or long-distance rides.
2. 2015
- San Francisco again dominated with 6,628 trips, an increase from 2014, and a similar average trip duration (1,008.47 seconds).
- Palo Alto -> Palo Alto stood out with 81 trips and the highest average duration (15,833.48 seconds), suggesting unique, long-duration usage patterns.
- San Jose showed shorter average trips (919.09 seconds) despite a considerable number of trips (387), indicating a shift toward shorter commutes.
# Yearly totals: number of trips and summed trip duration (seconds).
data <- data.frame(
  Year = c("2014", "2015"),
  Total_Trips = c(nrow(trips_2014), nrow(trips_2015)),
  Total_Duration = c(sum(trips_2014$duration), sum(trips_2015$duration))
)

# Bars show trip counts on the primary axis. Total duration is divided by 1000
# so the line sits on the same scale as the bars, and the secondary axis
# multiplies by 1000 again to label it in seconds.
ggplot(data) +
  geom_col(
    aes(x = Year, y = Total_Trips, fill = "Total Trips"),
    position = "dodge",
    width = 0.4
  ) +
  geom_line(
    aes(x = Year, y = Total_Duration / 1000, group = 1, color = "Total Duration"),
    linewidth = 1.5,
    linetype = "dashed"
  ) +
  geom_point(
    aes(x = Year, y = Total_Duration / 1000, color = "Total Duration"),
    size = 4
  ) +
  labs(
    title = "Total Trips and Total Duration in July 2014 and 2015",
    x = "Year",
    y = "Value",
    fill = "Legend",
    color = "Legend"
  ) +
  scale_y_continuous(
    name = "Total Trips",
    sec.axis = sec_axis(
      ~ . * 1000,
      name = "Total Duration (seconds)",
      labels = scales::comma
    )
  ) +
  scale_color_manual(values = c("Total Duration" = "red")) +
  scale_fill_manual(values = c("Total Trips" = "skyblue")) +
  theme_minimal()
# Mean trip duration (seconds) per year.
avg_data <- data.frame(
  Year = c("2014", "2015"),
  Average_Duration = c(mean(trips_2014$duration), mean(trips_2015$duration))
)

# One bar per year, with thousands separators on the y axis.
ggplot(avg_data, aes(x = Year, y = Average_Duration, fill = Year)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Average Trip Duration in July 2014 and 2015",
    x = "Year",
    y = "Average Duration (seconds)"
  ) +
  theme_minimal()
We will first make a data frame that contains start_station, end_station, num_trips, duration and avg_duration for both years:
# Per route (start station, end station) in 2014: trip count, total duration
# and mean duration per trip.
route_keys <- c("start_station_name", "end_station_name")

# Counting the duration values per route gives the number of trips.
popular_routes_num_trips_2014 <- setNames(
  aggregate(
    duration ~ start_station_name + end_station_name,
    data = trips_2014,
    FUN = length
  ),
  c(route_keys, "num_trips_2014")
)

popular_routes_duration_2014 <- setNames(
  aggregate(
    duration ~ start_station_name + end_station_name,
    data = trips_2014,
    FUN = sum
  ),
  c(route_keys, "duration_2014")
)

popular_routes_2014 <- merge(
  popular_routes_num_trips_2014,
  popular_routes_duration_2014,
  by = route_keys
)
popular_routes_2014$avg_duration_2014 <- with(
  popular_routes_2014,
  round(duration_2014 / num_trips_2014, 2)
)
# Per route (start station, end station) in 2015: trip count, total duration
# and mean duration per trip.
route_keys <- c("start_station_name", "end_station_name")

# Counting the duration values per route gives the number of trips.
popular_routes_num_trips_2015 <- setNames(
  aggregate(
    duration ~ start_station_name + end_station_name,
    data = trips_2015,
    FUN = length
  ),
  c(route_keys, "num_trips_2015")
)

popular_routes_duration_2015 <- setNames(
  aggregate(
    duration ~ start_station_name + end_station_name,
    data = trips_2015,
    FUN = sum
  ),
  c(route_keys, "duration_2015")
)

popular_routes_2015 <- merge(
  popular_routes_num_trips_2015,
  popular_routes_duration_2015,
  by = route_keys
)
popular_routes_2015$avg_duration_2015 <- with(
  popular_routes_2015,
  round(duration_2015 / num_trips_2015, 2)
)
head(popular_routes_2014[order(-popular_routes_2014$num_trips_2014), ], 5)
## start_station_name end_station_name
## 472 Harry Bridges Plaza (Ferry Building) Embarcadero at Sansome
## 383 Embarcadero at Sansome Steuart at Market
## 819 San Francisco Caltrain (Townsend at 4th) Embarcadero at Folsom
## 43 2nd at South Park Market at Sansome
## 595 Market at Sansome 2nd at South Park
## num_trips_2014 duration_2014 avg_duration_2014
## 472 82 107834 1315.05
## 383 50 20609 412.18
## 819 48 39512 823.17
## 43 46 18234 396.39
## 595 46 17118 372.13
head(popular_routes_2015[order(-popular_routes_2015$num_trips_2015), ], 5)
## start_station_name
## 466 Harry Bridges Plaza (Ferry Building)
## 1109 Townsend at 7th
## 68 2nd at Townsend
## 376 Embarcadero at Sansome
## 458 Harry Bridges Plaza (Ferry Building)
## end_station_name num_trips_2015 duration_2015
## 466 Embarcadero at Sansome 86 302489
## 1109 San Francisco Caltrain 2 (330 Townsend) 79 72478
## 68 Harry Bridges Plaza (Ferry Building) 78 61099
## 376 Steuart at Market 70 28426
## 458 2nd at Townsend 58 38755
## avg_duration_2015
## 466 3517.31
## 1109 917.44
## 68 783.32
## 376 406.09
## 458 668.19
head(popular_routes_2014[order(-popular_routes_2014$duration_2014), ],5)
## start_station_name end_station_name
## 114 Arena Green / SAP Center Adobe on Almaden
## 796 San Antonio Caltrain Station Castro Street and El Camino Real
## 784 Rengstorff Avenue / California Street Rengstorff Avenue / California Street
## 476 Harry Bridges Plaza (Ferry Building) Harry Bridges Plaza (Ferry Building)
## 892 San Jose City Hall Ryland Park
## num_trips_2014 duration_2014 avg_duration_2014
## 114 1 715339 715339.00
## 796 2 145090 72545.00
## 784 2 119454 59727.00
## 476 16 113310 7081.88
## 892 2 111430 55715.00
head(popular_routes_2015[order(-popular_routes_2015$duration_2015), ], 5)
## start_station_name end_station_name
## 1121 University and Emerson University and Emerson
## 466 Harry Bridges Plaza (Ferry Building) Embarcadero at Sansome
## 670 Mountain View City Hall Mountain View Caltrain Station
## 578 Market at 4th Market at 4th
## 391 Embarcadero at Vallejo Market at 4th
## num_trips_2015 duration_2015 avg_duration_2015
## 1121 10 1205549 120554.90
## 466 86 302489 3517.31
## 670 34 167330 4921.47
## 578 14 145523 10394.50
## 391 5 132222 26444.40
head(popular_routes_2014[order(-popular_routes_2014$avg_duration_2014), ],5)
## start_station_name end_station_name
## 114 Arena Green / SAP Center Adobe on Almaden
## 796 San Antonio Caltrain Station Castro Street and El Camino Real
## 784 Rengstorff Avenue / California Street Rengstorff Avenue / California Street
## 892 San Jose City Hall Ryland Park
## 118 Arena Green / SAP Center San Jose Diridon Caltrain Station
## num_trips_2014 duration_2014 avg_duration_2014
## 114 1 715339 715339
## 796 2 145090 72545
## 784 2 119454 59727
## 892 2 111430 55715
## 118 2 108624 54312
head(popular_routes_2015[order(-popular_routes_2015$avg_duration_2015), ], 5)
## start_station_name end_station_name
## 1121 University and Emerson University and Emerson
## 127 Arena Green / SAP Center San Salvador at 1st
## 784 Redwood City Caltrain Station Redwood City Caltrain Station
## 391 Embarcadero at Vallejo Market at 4th
## 1119 University and Emerson Mountain View Caltrain Station
## num_trips_2015 duration_2015 avg_duration_2015
## 1121 10 1205549 120554.9
## 127 1 95920 95920.0
## 784 1 31381 31381.0
## 391 5 132222 26444.4
## 1119 2 46119 23059.5
Insights
1. Most Popular Routes
- Harry Bridges Plaza (Ferry Building) to Embarcadero at Sansome was the most frequent route in both years (82 trips in 2014, 86 in 2015), solidifying its role as a central commuter hub.
- 2015 saw new popular routes like Townsend at 7th to San Francisco Caltrain 2, reflecting changing commuter patterns.
2. Routes with Longest Total Durations
- 2014: Outlier trips like Arena Green / SAP Center to Adobe on Almaden (715,339 seconds) dominated.
- 2015: University and Emerson to itself emerged as the longest total duration route (1,205,549 seconds), indicating increased recreational use.
3. Routes with Longest Average Durations
- Both years featured outliers (e.g., University and Emerson, Arena Green / SAP Center) with unusually long durations, likely non-commuter trips.
First, I will combine data for both years for comparison
# Full outer join of the per-route trip counts of both years; a route missing
# in one year gets NA for that year's count.
route_keys <- c("start_station_name", "end_station_name")
popular_routes_combined <- merge(
  popular_routes_2014[, c(route_keys, "num_trips_2014")],
  popular_routes_2015[, c(route_keys, "num_trips_2015")],
  by = route_keys,
  all = TRUE
)

# Sort by 2014 trips, then 2015 trips, both descending (NA counts sort last).
combined_rank <- with(
  popular_routes_combined,
  order(-num_trips_2014, -num_trips_2015)
)
popular_routes_combined <- popular_routes_combined[combined_rank, ]
Then, I only take the top 5 popular routes:
# Keep the five highest-ranked routes (fewer if the table has fewer rows).
popular_routes_top5 <- head(popular_routes_combined, 5)
n_top_routes <- nrow(popular_routes_top5)

# Axis labels: the first 10 characters of each station name, so the labels fit
# under the bars.
route_label <- paste(
  substr(popular_routes_top5$start_station_name, 1, 10),
  "...",
  "->",
  substr(popular_routes_top5$end_station_name, 1, 10),
  "..."
)

# Long format: the 2014 counts for all routes first, then the 2015 counts.
# Labels and years are repeated explicitly from the actual number of routes
# rather than a hard-coded 5, so the table is still built correctly when fewer
# than five routes are available.
popular_routes_top5_long <- data.frame(
  route = rep(route_label, times = 2),
  num_trips = c(popular_routes_top5$num_trips_2014, popular_routes_top5$num_trips_2015),
  year = rep(c("2014", "2015"), each = n_top_routes)
)
And plotting
# Side-by-side bars per route, one bar per year; x labels are rotated so the
# route names stay readable.
ggplot(
  data = popular_routes_top5_long,
  mapping = aes(x = route, y = num_trips, fill = year)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Most Popular Routes by Number of Trips (Top 5)",
    x = "Route",
    y = "Number of Trips",
    fill = "Year"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
First, I combine data for both years
# Stack the start/end station names of both years into one table.
heatmap_cols <- c("start_station_name", "end_station_name")
heatmap_data_2014 <- trips_2014[heatmap_cols]
heatmap_data_2015 <- trips_2015[heatmap_cols]
heatmap_data <- do.call(rbind, list(heatmap_data_2014, heatmap_data_2015))
And create heatmap data frame of top 25
# Count trips per (start station, end station) pair across both years and keep
# the 25 most frequent pairs.
heatmap_df <- as.data.frame(
  table(
    Start_Station = heatmap_data$start_station_name,
    End_Station = heatmap_data$end_station_name
  ),
  responseName = "Frequency"
)
rows_by_frequency <- order(-heatmap_df$Frequency)
heatmap_df <- heatmap_df[head(rows_by_frequency, 25), ]
And plotting:
# Tile heatmap of the most frequent station pairs; darker blue means more trips.
ggplot(
  data = heatmap_df,
  mapping = aes(x = Start_Station, y = End_Station, fill = Frequency)
) +
  geom_tile() +
  labs(
    title = "Heatmap of Top 25 Station Usage",
    x = "Start Station",
    y = "End Station",
    fill = "Frequency"
  ) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Overall, the data reflects growing bike-share usage, particularly in San Francisco, and a trend toward longer and more varied trips.