SQL Statement equivalents in Python and R

A Python/R cheat sheet for SQL developers. The variable “df” is the data frame / table name. For python code, pandas has been imported as pd, and numpy as np. For R, some code assumes dplyr is loaded via libary(dplyr).

Select individual column

select column_name from df
df.column_name

# Alternative 
df['column_name']

# Using a variable
myvar = 'column_name'
df[myvar]
df$column_name

# Alternative, needed if column name has spaces/weird characters
df$`column_name`

# Alternative
df[, 'column_name']

# Using a variable
myvar <- 'column_name'
df[, myvar]

Select multiple columns

select column_a, column_b, column_c from df
# By name
df[['column_a', 'column_b', 'column_c']]

# By index, where they are the first three columns
df.iloc[:, 0:4]
# By name
df[, c('column_a', 'column_b', 'column_c')]

# By name using dplyr
df %>% select(column_a, column_b, column_c)

# By index, where they are the first three columns
df[, 1:3]

Top 10 rows

select * from df limit 10
df.head(10)
# standard way
head(df, 10)

# with dplyr
df %>% head(10)

Unique Values and count for a column

# Just the values
select distinct(column_name) from df

# Values with a count
select column_name, count(*) from df
group by column_name
# Just the values
df.column_name.unique()

# Values with a count
df.column_name.value_counts()
# Just the values
unique(df$column_name)

# Values with a count
table(df$column_name)

# Values with a count using dplyr
df %>% select(column_name) 
   %>% count(column_name)

Filtering by column value

# Equal to a specified value
select * from df where col_name = 'value'

# Equal to a value in a set
select * from df where col_name in ('value1', 'value2')

# Value is empty
select * from df where col_name is null

# Find a word within a string
select * from df where col_string like '%word%'

# Filter by numeric value greater than 100
select * from df where col_numeric > 100
# Equal to a specified value
df[df.col_name == 'value']

# Equal to a value in a set
df[df.col_name.isin(['value1', 'value2'])]

# Value is empty
df[df.col_name.isnull()]

# Find a word within a string
df[df.col_string.str.contains('word') == True]

# Filter by numeric value greater than 100
df[df.col_numeric > 100]
# Equal to a specified value
df[df$col_name == 'value', ]

# Equal to a value in a set
df[df$col_name %in% c('value1', 'value2'), ]

# Value is empty
df[is.na(df$col_name), ]

# Find a word within a string
df[grepl('word', df$col_string, fixed=T), ]

# Filter by numeric value greater than 100
df[df$col_name > 100, ]

Multi column and multi table/dataframe conditions

# Condition between two columns of same table
select * from df where col_a != col_b

# Inner join between two tables
select df1.col_a, df1.col_b, df2.col_x, df2.col_y from df1, df2
where df1.col_common = df2.col_common 

# Condition between two columns of same table
df[df['col_a'] != df['col_b']]

# Inner join between two tables
pd.merge(df1, df2, on='col_common', how='inner')[['col_a', 'col_b', 'col_x', 'col_y']]
# Condition between two columns of same table
df[df$col_a != df$col_b, ]

# Inner join between two tables (uses dplyr)
merge(df1, df2, by="col_common") %>% 
  select(col_a, col_b, col_x, col_y)

Update and Alter Statements

# Setting values that are null to fill_value
update df set column_name = 'fill_value' where column_name is null

# Setting values conditionally
update df set column_name = 'new_value'
where column_name in ('old_value1', 'old_value2')

# Conditional delete of rows
delete from df where column_name = 'some_value'

# Create copy of column old_column in new column called new_column 
# (Oracle syntax - other DBs differ)
alter table df add (new_column varchar2(50));
update df set new_column = old_column;

# Renaming a column (Oracle syntax - other DBs differ)
alter table df rename column "old column name" "newcolumnname" 

# Setting values that are null to fill_value
df['column_name'] = df['column_name'].fillna('fill_value')

# Setting values conditionally
df.loc[df['column_name'].isin(['old_value1', 'old_value2']), 
       'column_name'] = 'new_value'

# Conditional delete of rows
df.drop(np.where(df['column_name'] == 'some_value')[0], inplace=True)

# Create copy of column old_column in new column called new_column 
df['new_column'] = df['old_column']

# Renaming a column
df.rename(columns = {'old column name':'newcolumnname'}, inplace=True)
# Setting values that are null to fill_value
df$column_name[is.na(df$column_name)] <- 'fill_value'

# Setting values conditionally
df$column_name[df$column_name %in% c('old_value1', 'old_value2')] <- 'new_value'

# Conditional delete of rows
df <- df[df$column_name != 'some_value',]

# Create copy of column old_column in new column called new_column 
df$new_column <- df$old_column

# Create copy of column old column in new column called new_column (i.e. deal with spaces)
df$new_column = df$`old column`

# Renaming a column
names(df)[names(df) == "old column name"] <- "newcolumnname"

# Renaming a column with dplyr
df <- df %>% rename("newcolumname" = "old column name")