R for Stata Users: Complete Stata-to-R Reference Translation
Stata and R both excel at statistical analysis, but their syntax is worlds apart. This guide maps every common Stata command to its R equivalent so you can transition without losing productivity.
Stata users love its concise syntax — reg y x1 x2 is hard to beat for brevity. R is more verbose but also more flexible: you get 20,000+ packages, publication-quality graphics, and a full programming language. This guide helps you translate the Stata you know into idiomatic R.
Key Conceptual Differences
Before diving into syntax, understand these fundamental differences:
Concept
Stata
R
Data in memory
One dataset at a time
Multiple data frames simultaneously
Variable access
Just type the name
Use df$varname or with(df, ...)
Missing values
. (dot)
NA
String delimiter
"text"
"text" or 'text'
Comments
// comment or /* */
# comment
Command terminator
Newline
Newline (or ;)
Looping
forvalues, foreach
for, lapply, sapply
Output
Results window
Console (or R Markdown for reports)
Data Management
Loading and Saving Data
Stata
R Equivalent
use "data.dta"
haven::read_dta("data.dta")
save "data.dta", replace
haven::write_dta(df, "data.dta")
import delimited "data.csv"
read.csv("data.csv")
export delimited "data.csv"
write.csv(df, "data.csv", row.names = FALSE)
describe
str(df)
list in 1/10
head(df, 10)
count
nrow(df)
browse
View(df) (in RStudio)
Creating and Modifying Variables
Stata
R Equivalent
gen newvar = x + y
df$newvar <- df$x + df$y
replace x = 0 if x < 0
df$x[df$x < 0] <- 0
gen log_x = log(x)
df$log_x <- log(df$x)
gen age_sq = age^2
df$age_sq <- df$age^2
egen mean_x = mean(x), by(group)
df$mean_x <- ave(df$x, df$group, FUN = function(v) mean(v, na.rm = TRUE))
egen total_x = total(x), by(group)
df$total_x <- ave(df$x, df$group, FUN = function(v) sum(v, na.rm = TRUE))
egen rank_x = rank(x)
df$rank_x <- rank(df$x)
recode x (1/3=1)(4/6=2)(7/10=3)
df$x_r <- cut(df$x, c(0,3,6,10), labels=1:3)
encode strvar, gen(numvar)
df$numvar <- as.numeric(factor(df$strvar))
decode numvar, gen(strvar)
df$strvar <- as.character(haven::as_factor(df$numvar))
destring x, replace
df$x <- as.numeric(df$x)
tostring x, replace
df$x <- as.character(df$x)
label variable x "Label"
attr(df$x, "label") <- "Label"
drop x y
df$x <- NULL; df$y <- NULL
keep x y z
df <- df[, c("x", "y", "z")]
rename old new
names(df)[names(df) == "old"] <- "new"
# Stata gen / replace / egen walk-through on the built-in mtcars data
df <- mtcars
df[["car"]] <- row.names(df)

# gen efficiency = mpg / wt   (rounded to two decimals)
mpg_per_wt <- round(df[["mpg"]] / df[["wt"]], 2)
# replace efficiency = 0 if efficiency < 5
df[["efficiency"]] <- replace(mpg_per_wt, mpg_per_wt < 5, 0)

# egen mean_mpg = mean(mpg), by(cyl)
# ave() returns the group mean repeated for every row of that group.
df[["mean_mpg"]] <- with(df, ave(mpg, cyl, FUN = mean))

cat("=== First 6 rows ===\n")
shown_cols <- c("car", "mpg", "wt", "cyl", "efficiency", "mean_mpg")
print(head(df[, shown_cols]))
▶ Run
↺ Reset
Subsetting and Sorting
Stata
R Equivalent
keep if x > 10
df <- subset(df, x > 10)
drop if missing(x)
df <- df[!is.na(df$x), ]
sort x
df <- df[order(df$x), ]
gsort -x
df <- df[order(-df$x), ]
sort group x
df <- df[order(df$group, df$x), ]
duplicates drop
df <- df[!duplicated(df), ]
duplicates list
df[duplicated(df) | duplicated(df, fromLast = TRUE), ]
Merging and Appending
Stata
R Equivalent
merge 1:1 id using "file2.dta"
merge(df1, df2, by = "id", all = TRUE)
merge m:1 id using "lookup.dta"
merge(df1, lookup, by = "id", all = TRUE) (use all.x = TRUE for keep(master match))
append using "file2.dta"
rbind(df1, df2)
reshape wide val, i(id) j(time)
reshape(df, direction="wide", idvar="id", timevar="time")
reshape long val, i(id) j(time)
reshape(df, direction="long", varying=..., idvar="id")
# Stata merge equivalent
# customers is the master data (one row per id); purchases is the using
# data and may hold several rows per id, so this is a 1:m merge on id.
customers <- data.frame(
  id = 1:5,
  name = c("Alice", "Bob", "Carol", "Dave", "Eve")
)
purchases <- data.frame(id = c(1, 2, 2, 5), amount = c(50, 30, 75, 120))
# merge 1:m id using purchases, keep(master match)
# all.x = TRUE keeps customers without any purchase (their amount is NA).
# Stata's default also keeps using-only rows; use all = TRUE to mirror that.
result <- merge(customers, purchases, by = "id", all.x = TRUE)
cat("=== Left Join (1:m merge) ===\n")
print(result)
▶ Run
↺ Reset
Statistical Analysis
Descriptive Statistics
Stata
R Equivalent
summarize x
summary(df$x)
summarize x, detail
summary(df$x), plus sd(df$x) and quantile(df$x)
tabulate x
table(df$x)
tab x y
table(df$x, df$y)
tab x y, chi2
chisq.test(table(df$x, df$y), correct = FALSE)
correlate x y z
cor(df[, c("x","y","z")], use = "complete.obs")
pwcorr x y z, sig
psych::corr.test(df[,c("x","y","z")], adjust = "none")
# Stata summarize equivalent
# Builds a two-column table (statistic name, value) for mtcars$mpg.
cat("=== summarize mpg, detail ===\n")
mpg <- mtcars[["mpg"]]
# names = FALSE drops the "25%"/"75%" labels so they do not leak into the table
quartiles <- quantile(mpg, probs = c(0.25, 0.75), names = FALSE)
stat_values <- c(
  N = length(mpg),
  Mean = round(mean(mpg), 2),
  SD = round(sd(mpg), 2),
  Min = min(mpg),
  P25 = quartiles[1],
  Median = median(mpg),
  P75 = quartiles[2],
  Max = max(mpg)
)
stats <- data.frame(Stat = names(stat_values), Value = unname(stat_values))
print(stats, row.names = FALSE)
▶ Run
↺ Reset
Regression
Stata
R Equivalent
reg y x1 x2
lm(y ~ x1 + x2, data = df)
reg y x1 x2, robust
lmtest::coeftest(model, vcov = sandwich::vcovHC(model, type = "HC1"))
reg y x1 x2, cluster(id)
lmtest::coeftest(model, vcov = sandwich::vcovCL(model, cluster = ~id))
logit y x1 x2
glm(y ~ x1 + x2, family = binomial, data = df)
probit y x1 x2
glm(y ~ x1 + x2, family = binomial(link="probit"), data = df)
ologit y x1 x2
MASS::polr(factor(y) ~ x1 + x2, data = df)
poisson y x1 x2
glm(y ~ x1 + x2, family = poisson, data = df)
tobit y x1 x2, ll(0)
AER::tobit(y ~ x1 + x2, left = 0, data = df)
predict yhat
fitted(model)
predict resid, residuals
residuals(model)
test x1 = x2
car::linearHypothesis(model, "x1 = x2")
vif
car::vif(model)
# Stata reg equivalent: OLS of mpg on weight (wt) and horsepower (hp)
model <- lm(mpg ~ wt + hp, data = mtcars)
model_summary <- summary(model)
cat("=== reg mpg wt hp ===\n")
# Prints the coefficient table, residual summary, R-squared and F-test
print(model_summary)
▶ Run
↺ Reset
T-tests and ANOVA
Stata
R Equivalent
ttest x == 0
t.test(df$x, mu = 0)
ttest x, by(group)
t.test(x ~ group, data = df, var.equal = TRUE)
ttest x == y (paired)
t.test(df$x, df$y, paired = TRUE)
oneway y group, tabulate
summary(aov(y ~ factor(group), data = df))
anova y group1 group2
summary(aov(y ~ factor(group1) + factor(group2), data = df))
# Stata ttest equivalent
# Stata's `ttest mpg, by(am)` pools the two group variances by default,
# whereas R's t.test() defaults to Welch's unequal-variance test.
# var.equal = TRUE makes t, the degrees of freedom and the p-value match
# Stata; drop it to get the equivalent of Stata's `welch` option.
cat("=== ttest mpg, by(am) ===\n")
result <- t.test(mpg ~ am, data = mtcars, var.equal = TRUE)
cat("t =", round(result$statistic, 3), "\n")
cat("p-value =", round(result$p.value, 4), "\n")
# In mtcars, am == 0 is automatic and am == 1 is manual; estimates are
# returned in that order.
cat("Mean (auto):", round(result$estimate[1], 2), "\n")
cat("Mean (manual):", round(result$estimate[2], 2), "\n")
▶ Run
↺ Reset
Panel Data
Stata
R Equivalent
xtset id time
Use plm package: pdata.frame(df, index = c("id","time"))
xtreg y x1, fe
plm(y ~ x1, data = pdf, model = "within")
xtreg y x1, re
plm(y ~ x1, data = pdf, model = "random")
hausman fe re
phtest(fe_model, re_model)
# Panel data concepts (without plm for WebR compatibility)
# Nothing is estimated here: the plm workflow is only printed as text.
# sep = "" keeps cat() from inserting spaces between the pieces.
cat(
  "=== Panel Data in R ===\n",
  "For panel/longitudinal data, use the plm package:\n\n",
  'library(plm)\n',
  'pdf <- pdata.frame(df, index = c("id", "time"))\n',
  'fe_model <- plm(y ~ x1 + x2, data = pdf, model = "within")\n',
  're_model <- plm(y ~ x1 + x2, data = pdf, model = "random")\n',
  'phtest(fe_model, re_model) # Hausman test\n',
  sep = ""
)
▶ Run
↺ Reset
Programming Constructs
Stata
R Equivalent
local x = 5
x <- 5
global x = 5
x <<- 5 (avoid this; use function args)
forvalues i = 1/10 { }
for (i in 1:10) { }
foreach v of varlist x y z { }
for (v in c("x","y","z")) { }
while condition { }
while (condition) { }
capture noisily command
tryCatch(expr, error = function(e) ...)
program define name
name <- function(...) { }
display "text"
cat("text\n")
return scalar r = val
return(val) in a function
do "script.do"
source("script.R")
Summary: Stata → R Quick Reference
Category
Stata Command
R Equivalent
Load data
use "file.dta"
haven::read_dta("file.dta")
Describe
describe
str(df)
Create var
gen x = expr
df$x <- expr
Filter
keep if cond
df <- subset(df, cond)
Sort
sort var
df <- df[order(df$var), ]
Merge
merge 1:1 id using file
merge(df1, df2, by = "id", all = TRUE)
Summarize
summarize x
summary(df$x)
Tabulate
tab x y
table(df$x, df$y)
Regression
reg y x1 x2
lm(y ~ x1 + x2, data = df)
Logistic
logit y x1 x2
glm(y ~ x1 + x2, family = binomial, data = df)
T-test
ttest x, by(g)
t.test(x ~ g, data = df, var.equal = TRUE)
ANOVA
oneway y group
aov(y ~ factor(group), data = df)
FAQ
Can R read Stata .dta files? Yes. Use haven::read_dta("file.dta"). It supports Stata versions 8 through 18 and preserves variable labels and value labels. Install with install.packages("haven").
How do I get Stata-style robust standard errors in R? Use the sandwich and lmtest packages: lmtest::coeftest(model, vcov = sandwich::vcovHC(model, type = "HC1")). The HC1 type matches Stata's robust option.
Is R slower than Stata for large datasets? For standard operations, R is comparable or faster — especially with data.table for very large data. R's vectorized operations and packages like collapse can outperform Stata on datasets with millions of rows.
What's Next