The main focus is to investigate the Gapminder dataset and interact with it. To illustrate the basic use of EDA with the dplyr and ggplot2 packages, I use the “gapminder” dataset. This data is a data frame of life expectancy, population, and GDP per capita recorded by country and year.
- Using the dplyr package to perform data transformation and manipulation operations.
- Using the ggplot2 package to visually analyze our data.
Load Packages
#install.packages("gapminder")
library(gapminder)
library(dplyr)
library(ggplot2)
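The variables are explained as follows (the level counts and year range describe the filtered `gapminder` data; `gapminder_unfiltered` contains additional countries and years): country — factor with 142 levels; continent — factor with 5 levels; year — ranges from 1952 to 2007 in increments of 5 years; lifeExp — life expectancy at birth, in years; pop — population; gdpPercap — GDP per capita.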
# Preview the first five rows of the unfiltered data.
head(gapminder_unfiltered, n = 5)
data:image/s3,"s3://crabby-images/fc311/fc311e42fe76ad2e002d63502c1d1d880ae0948e" alt=""
# Preview the last five rows of the unfiltered data.
tail(gapminder_unfiltered, n = 5)
data:image/s3,"s3://crabby-images/e7759/e775974fc1563bd0f133c6e2945d2d4e7c661415" alt=""
Display name of Variables :
# List the column names of the data frame.
gapminder_unfiltered %>% names()
data:image/s3,"s3://crabby-images/43300/433005cb2585af58058d53b8025ecf7d49fbfab6" alt=""
Data Cleaning :
Checking for missing values: as the output below shows, this data has no missing values.
# Compact display of each column's type and first values.
gapminder_unfiltered %>% str()
data:image/s3,"s3://crabby-images/1f5c3/1f5c30c47ea76f6abe015e33497ad8d61cd284c3" alt=""
# Per-column summary; note the "(Other) :2965" entry in the output.
gapminder_unfiltered %>% summary()
data:image/s3,"s3://crabby-images/ad335/ad33520db1e5d5ab20b0da8d6c27f0409e6e7409" alt=""
# Total number of NA cells across the whole data frame.
gapminder_unfiltered %>%
  is.na() %>%
  sum()
[1] 0
Hence, we found zero NA values in this dataset.
Display the continent , country and year
# Distinct (year, country, continent) combinations.
gapminder_unfiltered %>%
  select(year, country, continent) %>%
  unique()

# Number of distinct continents present in the data.
length(unique(gapminder_unfiltered[["continent"]]))

# Distinct years present in the data.
unique(gapminder_unfiltered[["year"]])
Structure
# Transposed overview: one line per column with its type and first values.
gapminder_unfiltered %>% glimpse()
data:image/s3,"s3://crabby-images/c81e0/c81e06a21cd386f5a794f8eb2616baa0cb0f5ab9" alt=""
Summary Calculating descriptive statistics using describe()
Hmisc: Harrell Miscellaneous
Contains many functions useful for data analysis, high-level graphics, utility operations, functions for computing sample size and power, importing and annotating datasets, imputing missing values, advanced table making, variable clustering, character string manipulation, conversion of R objects to LaTeX and html code, and recoding variables.
library(Hmisc)
# Descriptive statistics for every column, using Hmisc's describe().
Hmisc::describe(gapminder_unfiltered)
data:image/s3,"s3://crabby-images/539a1/539a17e62e2a6a6660b1411ce6b4442d06e94647" alt=""
Exploratory Data Analysis
data:image/s3,"s3://crabby-images/fbd49/fbd4971128aeec990c75d27017414cb9bc298d66" alt=""
# Plot the whole data frame (base graphics overview of all columns).
# The trailing " :" was caption residue and is not valid R, so it is removed.
plot(gapminder_unfiltered)
data:image/s3,"s3://crabby-images/f07a9/f07a97c0c90647f82fb8128724dc7d8906c92e92" alt=""
# Life expectancy by continent.
# `data =` is required: the columns are not visible as free variables unless
# the data frame has been attached, which this script never does.
boxplot(lifeExp ~ continent, data = gapminder_unfiltered)
data:image/s3,"s3://crabby-images/9b05f/9b05fc2071b2082b6a23e6926268dffe7edf6e02" alt=""
# Life expectancy against log GDP per capita.
# `data =` lets the formula (and the `col` argument, which the formula method
# evaluates in the data) resolve the column names; the trailing " :" caption
# residue is removed because it is not valid R.
plot(lifeExp ~ log(gdpPercap), data = gapminder_unfiltered, col = gdpPercap)
data:image/s3,"s3://crabby-images/89b37/89b373bed3dae9c24e58b6477aa6d3a3f8a0ee73" alt=""
For the year 2007, what is the distribution of GDP per capita across all countries?
# GDP per capita of every country in 2007, with its continent.
GDP_2007 <- gapminder_unfiltered %>%
  filter(year == 2007) %>%
  select(continent, country, gdpPercap)

GDP_2007
data:image/s3,"s3://crabby-images/738f1/738f10c93db00792d7dd3dd2ab82a2d8ee6e32ca" alt=""
# Histogram of 2007 GDP per capita (one observation per country, 40 bins).
# The x-axis carries GDP per capita and the y-axis the number of countries in
# each bin, so the axes are labelled accordingly. The x-axis text and ticks
# are kept visible: without them the distribution cannot be read.
ggplot(GDP_2007, aes(x = gdpPercap)) +
  geom_histogram(fill = "cyan", bins = 40) +
  ggtitle("Distribution of GDP per capita across all countries for 2007") +
  xlab("GDP per Capita") +
  ylab("Number of countries")
data:image/s3,"s3://crabby-images/eb1c3/eb1c30bf182a42212e4cedf5f9ecbd713fa72fcc" alt=""
# One point per country, coloured by continent.
# Country names are hidden on the x-axis because there are too many to print
# legibly. The title typo "Contries" is corrected.
ggplot(GDP_2007, aes(x = country, y = gdpPercap)) +
  geom_point(aes(colour = continent)) +
  ylab("GDP per Capita") +
  ggtitle("GDP Per Capita for Countries grouped by Continents for 2007") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
data:image/s3,"s3://crabby-images/c82ae/c82aea44bbc34a4ab75313ffaf1633e2ad6ee9c1" alt=""
For the year 2007, how do the distributions differ across the different continents?
# Bar per continent; bars stack the per-country values, so each bar's height
# is the sum of GDP per capita over that continent's countries.
ggplot(GDP_2007, aes(x = continent, y = gdpPercap)) +
  geom_col(fill = "green") +
  labs(
    x = "Continents",
    y = "GDP per Capita",
    title = "GDP Per Capita vs Continents for 2007"
  )
data:image/s3,"s3://crabby-images/2848e/2848e5cd17a936449dbed4e51792f347526a522d" alt=""
# Jittered points of 2007 GDP per capita within each continent.
# The colour is a fixed setting, so it goes outside aes(): inside aes() the
# string was treated as a one-level variable, which drew the default palette
# colour and added a meaningless legend. "fireebrick" is also not a valid
# colour name; the intended colour is "firebrick".
ggplot(GDP_2007, aes(x = continent, y = gdpPercap)) +
  geom_jitter(colour = "firebrick") +
  xlab("Continents") +
  ylab("GDP per capita") +
  ggtitle("GDP per capita Vs Continents for 2007")
data:image/s3,"s3://crabby-images/bf3c2/bf3c2619461451ea5cacc067ade5718bb06e896c" alt=""
For the year 2007, what are the top 10 countries with the largest GDP per capita?
# Rank countries by 2007 GDP per capita (largest first), keep only the
# country and gdpPercap columns, then take the first ten rows.
gdp_ranked <- GDP_2007[order(-GDP_2007$gdpPercap), c("country", "gdpPercap")]
top_10_gdpc <- gdp_ranked[1:10, ]

top_10_gdpc
We can also look up the GDP per capita for a specific country:
# 2007 row for India.
GDP_2007 %>% filter(country == "India")
Top 10 GDP per capita by country
# Bar chart of the ten countries with the largest GDP per capita.
ggplot(top_10_gdpc, aes(x = country, y = gdpPercap)) +
  geom_col(fill = "palegreen2") +
  labs(
    x = "Top 10 Countries",
    y = "GDP per Capita",
    title = "Top 10 GDP Per Capita vs Countries"
  )
data:image/s3,"s3://crabby-images/f64aa/f64aaf9e767381c392520672b36f1898b1cb0466" alt=""
Plot the GDP per capita for your country of origin for all years available
# GDP per capita for India across all available years.
GDP_India <- gapminder_unfiltered %>%
  filter(country == "India") %>%
  select(year, gdpPercap)

# Colour is mapped on the point layer only. As a global aesthetic the
# continuous `year` colour also reached geom_smooth(), which cannot keep an
# aesthetic that varies within a group and drops it (with a warning in
# recent ggplot2). The smoothed line is drawn in its default colour either
# way.
ggplot(GDP_India, aes(x = year, y = gdpPercap)) +
  geom_point(aes(colour = year)) +
  geom_smooth() +
  xlab("year") +
  ylab("GDP per capita") +
  ggtitle("GDP Per Capita vs Year for INDIA")
data:image/s3,"s3://crabby-images/fa696/fa6968a69907c988aa7d258e9d15fcd83282169e" alt=""
GDP per capita less than 50000 ,lifeExp and Continent
library(ggplot2)
# Life expectancy against log GDP per capita (rows with gdpPercap < 50000),
# one panel and one linear fit per continent.
gapminder %>%
  filter(gdpPercap < 50000) %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp, colour = continent)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~continent)
data:image/s3,"s3://crabby-images/c6751/c6751cffe4c86c8f4957f5134592a0550338c2a3" alt=""
GDP per capita less than 50000, lifeExp and year
# Life expectancy against log GDP per capita (rows with gdpPercap < 50000),
# points shaded by year, one panel and one linear fit per continent.
# The continuous `year` colour is mapped on the point layer only:
# geom_smooth() cannot keep an aesthetic that varies within a group and
# drops it (with a warning in recent ggplot2), so the fit line looks the same.
gapminder %>%
  filter(gdpPercap < 50000) %>%
  ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(aes(colour = year), alpha = 0.5) +
  geom_smooth(method = lm) +
  facet_wrap(~continent)
data:image/s3,"s3://crabby-images/907b7/907b77670d5f275596fd8b4845ad6488c828f3bc" alt=""
Life Expectancy of countries :
library(dplyr)
# Mean life expectancy over all available years for the United States
# and India.
gapminder_unfiltered %>%
  select(country, lifeExp) %>%
  filter(country %in% c("United States", "India")) %>%
  group_by(country) %>%
  summarise(avg_lifeExp = mean(lifeExp))
data:image/s3,"s3://crabby-images/ddb07/ddb07655241d335a5dfa31d464d627089cb30c4c" alt=""
Check the life Expectancy using T test :
# Two-sample t-test of life expectancy between India and the United States.
df1 <- gapminder_unfiltered %>%
  select(country, lifeExp) %>%
  filter(country %in% c("United States", "India"))

t.test(lifeExp ~ country, data = df1)
data:image/s3,"s3://crabby-images/d2ee8/d2ee89471c9b6215a1f5977a91d8d878b7bacf3a" alt=""
Looking at the degrees of freedom (“df”) and the p-value, there is a significant difference in average lifeExp between India and the United States. The p-value is 5.311e-06, so we reject the null hypothesis that the two means are equal.
Bonus Information Just Introduction : ## Regression :
# Simple linear regression of life expectancy on GDP per capita.
# `data =` is required so the formula can find the columns; without it lm()
# fails unless the data frame has been attached.
# NOTE(review): gapminder_unfiltered is assumed here because most of the
# analysis uses it; confirm it is the data behind the published output.
summary(lm(lifeExp ~ gdpPercap, data = gapminder_unfiltered))
data:image/s3,"s3://crabby-images/f5c9f/f5c9f744628dcbc786b76c175765f16af0ccc9f6" alt=""
# Multiple regression of life expectancy on GDP per capita, population and
# continent.
# `data =` is required so the formula can find the columns; without it lm()
# fails unless the data frame has been attached.
# NOTE(review): gapminder_unfiltered is assumed here because most of the
# analysis uses it; confirm it is the data behind the published output.
summary(lm(lifeExp ~ gdpPercap + pop + continent, data = gapminder_unfiltered))
data:image/s3,"s3://crabby-images/c7db4/c7db48ab88ff94b3cb66762ce4facb535d2010f4" alt=""
Comments
Post a Comment