$, [,], and subset()
$ and <-
# Create a dataframe called study
# (the last sex value, "x34", is not a valid code; it is recoded to NA below)
study <- data.frame(
  id = 1:8,
  sex = c("m", "f", "m", "m", "m", "f", "m", "x34"),
  age = c(28, 24, 19, 23, 42, 32, 27, 24),
  eyecolor = c("blue", "brown", "brown", "green",
               "blue", "brown", "blue", "green"),
  group = c(1, 1, 1, 1, 2, 2, 2, 2),
  score = c(78, 65, 94, 92, 84, 86, 92, 86),
  stringsAsFactors = FALSE
)

# Summary statistics from specific columns
# (the mean of a logical vector is the proportion of TRUE values, from 0 to 1)
mean(study$age) # Mean age
table(study$sex) # Counts of each sex
mean(study$sex == "m") # Proportion that are men
mean(study$eyecolor %in% c("blue", "brown")) # Proportion of eye colors that are blue or brown

# Indexing
study[1:5, ] # First 5 rows
# study has only 8 rows: asking for rows past the end would return rows
# filled with NA, so stop at row 8
study[6:8, c("id", "sex", "score")] # Rows 6-8 and columns id, sex and score

# Subsetting
study_men <- subset(study, sex == "m")
study_g1 <- subset(study, group == 1)
study_g2 <- subset(study, group == 2)

# Different ways to do the same subsetting
# Q: What is the mean score of group 2?
study_g2 <- subset(study, group == 2) # Method 1A: Create study_g2 dataframe
mean(study_g2$score) # 1B: Calculate mean of study_g2$score
mean(subset(study, group == 2)$score) # Method 2: Same as method 1 but in one step
with(subset(study, group == 2), mean(score)) # Method 3: Using with() and subset()
mean(study$score[study$group == 2]) # Method 4: Using []

# Q: What percent of women over the age of 20 had brown eyes?
study.women <- subset(study, sex == "f" & age > 20) # Method 1A:
mean(study.women$eyecolor == "brown") # 1B:
mean(subset(study, sex == "f" & age > 20)$eyecolor == "brown") # Method 2:
with(subset(study, sex == "f" & age > 20), mean(eyecolor == "brown")) # Method 3:
mean(study$eyecolor[study$sex == "f" & study$age > 20] == "brown") # Method 4:

# Changing values of a vector in a dataframe
# Change sex values that are NOT f or m to NA
study$sex[!(study$sex %in% c("f", "m"))] <- NA
# Change "f" to "female", and "m" to "male"
# (entries that are NA are left untouched by these single-value assignments)
study$sex[study$sex == "f"] <- "female"
study$sex[study$sex == "m"] <- "male"

# Changing column names
# Change name of first column to participant.id
names(study)[1] <- "participant.id"
# Change the name of columns 2 through 4
names(study)[2:4] <- c("gender", "age_years", "eye")
# Change name of group column to condition
names(study)[names(study) == "group"] <- "condition"
In a provocative paper, Bargh, Chen and Burrows (1996) sought to test whether or not priming people with trait concepts would trigger trait-consistent behavior. In one study, they primed participants with either neutral words (e.g., bat, cookie, pen), or with words related to an elderly stereotype (e.g., wise, stubborn, old). They then, unbeknownst to the participants, used a stopwatch to record how long it took the participants to walk down a hallway at the conclusion of an experiment. They predicted that participants primed with words related to the elderly would walk more slowly than those primed with neutral words.
In this WPA, you will analyze fake data corresponding to this study.
Our fake study has data from the following measures:
Variable | Description | Possible Values |
---|---|---|
prime | What kind of primes was the participant given? | neutral, elderly |
prime.duration | How long (in minutes) were primes displayed to participants? | 1, 5, 10, or 30 |
grandparents | Did the participant have a close relationship with their grandparents? | yes means yes, no means no, none means they never met their grandparents. |
id | The order in which participants completed the study | Integers from 1 to 500 |
age | Participants’ age | Integers larger than 18 |
sex | Participant’s sex | "m" = male, "f" = female |
attention | Did the participant pass an attention check? | 1 = yes, 0 = no |
walk | How long (in seconds) did participants take to walk down the hallway? | Positive numbers |
The data are stored in priming.txt. It is available at https://raw.githubusercontent.com/ndphillips/IntroductionR_Course/master/data/priming.txt. You can load the data into R as a new dataframe called priming by running the following:
priming <- read.table(file = "https://raw.githubusercontent.com/ndphillips/IntroductionR_Course/master/data/priming.txt",
stringsAsFactors = FALSE)
Here is how the data should look:
a | b | c | d | e | f | g | h |
---|---|---|---|---|---|---|---|
1 | m | 21 | 1 | asdf | 1 | no | 25.4 |
2 | m | 21 | 1 | asdf | 30 | no | 23.6 |
3 | f | 22 | 1 | asdf | 30 | none | 34.5 |
4 | m | 23 | 1 | elderly | 1 | yes | 40.4 |
5 | m | 23 | 1 | asdf | 10 | none | 25.0 |
6 | m | 22 | 1 | asdf | 10 | yes | 24.7 |
Look at the data using View(), summary(), head() and str().
View(priming)
summary(priming)
head(priming)
str(priming)
Look at the column names with names(). Those aren’t very informative, are they? Change the names to the correct values (make sure to use the naming scheme I describe in the dataset description).
names(priming) <- c("id", "sex", "age", "attention", "prime", "prime.duration", "grandparents", "walk")
Calculate the mean age. Then create a new vector called age.v that contains the age data, and calculate the mean age from this vector. Do you get the same result?
mean(priming$age)
## [1] 21.996
# Copy the age column into its own vector; its mean is the same value that
# mean(priming$age) gives
age.v <- priming$age
mean(age.v)
## [1] 21.996
# Median walking time (walk is recorded in seconds)
median(priming$walk)
## [1] 34.7
# Number of female participants: sum() counts the TRUE values of the comparison
sum(priming$sex == "f")
## [1] 252
# Number of male participants
sum(priming$sex == "m")
## [1] 248
What proportion of participants passed the attention check? (Hint: use mean())
mean(priming$attention)
## [1] 0.886
Create a new column called walk_m that shows the walking time in minutes rather than seconds.
priming$walk_m <- priming$walk / 60
# sex of the first 10 participants (positions 1 to 10 of the sex column)
priming$sex[1:10]
## [1] "m" "m" "f" "m" "m" "m" "f" "m" "f" "m"
# Every column of row 50 (the empty slot after the comma selects all columns)
priming[50,]
## id sex age attention prime prime.duration grandparents walk walk_m
## 50 50 m 21 1 elderly 1 none 34.3 0.5716667
Try answering these questions using one of the methods in the Examples above. The easiest method is Method 1. That is, first create a new dataframe object of the subsetted data, and then calculate the summary data from this new object.
# Mean walking time of participants given the elderly prime
with(priming, mean(walk[prime == "elderly"]))
## [1] 39.17612
# Mean walking time of participants given the neutral prime
with(priming, mean(walk[prime == "neutral"]))
## [1] 26.50167
# Mean walking time of participants younger than 23
with(priming, mean(walk[age < 23]))
## [1] 28.8359
# Mean walking time of women who had a close relationship with their
# grandparents
with(priming, mean(walk[sex == "f" & grandparents == "yes"]))
## [1] 34.22515
# Mean walking time of men older than 21 who never met their grandparents
with(priming, mean(walk[sex == "m" & age > 21 & grandparents == "none"]))
## [1] 25.76126
id, prime, and walk. Create a new dataframe called priming_simple
that only contains these columns.
# Keep only the id, prime and walk columns; the object is named
# priming_simple, the name the task statement asks for
priming_simple <- priming[c("id", "prime", "walk")]
Create a new dataframe called priming_c (a.k.a. priming clean) that only includes rows with valid values for each column – do this by looking for a few strange values in each column, and by looking at the original dataset description. Additionally, only include participants who passed the attention check. Here’s a skeleton of how your code should look:
# Create priming_c, a subset of the original priming data
# (replace __ with the appropriate values)
priming_c <- subset(priming,
subset = sex %in% c(_____) &
age > ____ &
attention == ___ &
prime %in% c(___) &
prime.duration %in% c(___) &
grandparents %in% c(___) &
walk > ___ )
# Build priming_c: the rows of priming in which every column holds a value
# allowed by the dataset description, restricted to participants who passed
# the attention check
priming_c <- subset(
  priming,
  attention == 1 &
    walk > 0 &
    age > 18 &
    sex %in% c("f", "m") &
    prime %in% c("neutral", "elderly") &
    grandparents %in% c("yes", "no", "none") &
    prime.duration %in% c(1, 5, 10, 30)
)
# Number of participants left after cleaning
nrow(priming_c)
## [1] 291
# Mean walking time under the elderly prime, cleaned data
mean(subset(priming_c, prime == "elderly")$walk)
## [1] 41.93209
# Mean walking time under the neutral prime, cleaned data
mean(subset(priming_c, prime == "neutral")$walk)
## [1] 30.25669
identical() (look at the help function with ?identical to see how it works)
v1 <- priming$walk
# v2 selects the walk column by name with single brackets and no comma
v2 <- priming["walk"]
# v3 selects the walk column with a logical column selector after the comma
v3 <- priming[,names(priming) == "walk"]
# v1 and v3 are vectors, while v2 is a dataframe
# vA extracts the walk column with $
vA <- priming$walk
# vB selects the walk column through subset()'s select argument
vB <- subset(priming, select = "walk")
# vA is a vector while vB is a dataframe
mean(vA)
## [1] 30.09736
# On a dataframe mean() does not stop with an error; it returns NA
mean(vB)
## [1] NA
# mean(vB) doesn't work because you can't take the mean of a dataframe.
Note: The following questions apply to your cleaned dataframe (priming_c)
id!)?
mean(priming_c$walk[priming_c$id <= 50 & priming_c$prime == "elderly"]) - mean(priming_c$walk[priming_c$id <= 50 & priming_c$prime == "neutral"])
## [1] 11.04059
# Priming effect (elderly mean walk minus neutral mean walk) among the
# participants with an id of 450 or more
with(
  priming_c,
  mean(walk[id >= 450 & prime == "elderly"]) -
    mean(walk[id >= 450 & prime == "neutral"])
)
## [1] 10.21579
# Strong relationship only
with(
  priming_c,
  mean(walk[grandparents == "yes" & prime == "elderly"]) -
    mean(walk[grandparents == "yes" & prime == "neutral"])
)
## [1] 13.57851
# No relationship only
with(
  priming_c,
  mean(walk[grandparents == "none" & prime == "elderly"]) -
    mean(walk[grandparents == "none" & prime == "neutral"])
)
## [1] 9.667544
priming_c dataframe.
priming_c <- priming_c[priming_c$id %in% seq(1, 501, by = 2),]
wpa_3_LastFirst.R file to me at nathaniel.phillips@unibas.ch.