A Python/R cheat sheet for SQL developers. The variable “df” is the data frame / table name. For python code, pandas has been imported as pd, and numpy as np. For R, some code assumes dplyr is loaded via libary(dplyr).
Select individual column
select column_name from df
df.column_name # Alternative df['column_name'] # Using a variable myvar = 'column_name' df[myvar]
df$column_name # Alternative, needed if column name has spaces/weird characters df$`column_name` # Alternative df[, 'column_name'] # Using a variable myvar <- 'column_name' df[, myvar]
Select multiple columns
select column_a, column_b, column_c from df
# By name df[['column_a', 'column_b', 'column_c']] # By index, where they are the first three columns df.iloc[:, 0:4]
# By name
df[, c('column_a', 'column_b', 'column_c')]
# By name using dplyr
df %>% select(column_a, column_b, column_c)
# By index, where they are the first three columns
df[, 1:3]
Top 10 rows
select * from df limit 10
df.head(10)
# standard way head(df, 10) # with dplyr df %>% head(10)
Unique Values and count for a column
# Just the values select distinct(column_name) from df # Values with a count select column_name, count(*) from df group by column_name
# Just the values df.column_name.unique() # Values with a count df.column_name.value_counts()
# Just the values unique(df$column_name) # Values with a count table(df$column_name) # Values with a count using dplyr df %>% select(column_name) %>% count(column_name)
Filtering by column value
# Equal to a specified value
select * from df where col_name = 'value'
# Equal to a value in a set
select * from df where col_name in ('value1', 'value2')
# Value is empty
select * from df where col_name is null
# Find a word within a string
select * from df where col_string like '%word%'
# Filter by numeric value greater than 100
select * from df where col_numeric > 100
# Equal to a specified value
df[df.col_name == 'value']
# Equal to a value in a set
df[df.col_name.isin(['value1', 'value2'])]
# Value is empty
df[df.col_name.isnull()]
# Find a word within a string
df[df.col_string.str.contains('word') == True]
# Filter by numeric value greater than 100
df[df.col_numeric > 100]
# Equal to a specified value
df[df$col_name == 'value', ]
# Equal to a value in a set
df[df$col_name %in% c('value1', 'value2'), ]
# Value is empty
df[is.na(df$col_name), ]
# Find a word within a string
df[grepl('word', df$col_string, fixed=T), ]
# Filter by numeric value greater than 100
df[df$col_name > 100, ]
Multi column and multi table/dataframe conditions
# Condition between two columns of same table select * from df where col_a != col_b # Inner join between two tables select df1.col_a, df1.col_b, df2.col_x, df2.col_y from df1, df2 where df1.col_common = df2.col_common
# Condition between two columns of same table df[df['col_a'] != df['col_b']] # Inner join between two tables pd.merge(df1, df2, on='col_common', how='inner')[['col_a', 'col_b', 'col_x', 'col_y']]
# Condition between two columns of same table df[df$col_a != df$col_b, ] # Inner join between two tables (uses dplyr) merge(df1, df2, by="col_common") %>% select(col_a, col_b, col_x, col_y)
Update and Alter Statements
# Setting values that are null to fill_value
update df set column_name = 'fill_value' where column_name is null
# Setting values conditionally
update df set column_name = 'new_value'
where column_name in ('old_value1', 'old_value2')
# Conditional delete of rows
delete from df where column_name = 'some_value'
# Create copy of column old_column in new column called new_column
# (Oracle syntax - other DBs differ)
alter table df add (new_column varchar2(50));
update df set new_column = old_column;
# Renaming a column (Oracle syntax - other DBs differ)
alter table df rename column "old column name" "newcolumnname"
# Setting values that are null to fill_value
df['column_name'] = df['column_name'].fillna('fill_value')
# Setting values conditionally
df.loc[df['column_name'].isin(['old_value1', 'old_value2']),
'column_name'] = 'new_value'
# Conditional delete of rows
df.drop(np.where(df['column_name'] == 'some_value')[0], inplace=True)
# Create copy of column old_column in new column called new_column
df['new_column'] = df['old_column']
# Renaming a column
df.rename(columns = {'old column name':'newcolumnname'}, inplace=True)
# Setting values that are null to fill_value
df$column_name[is.na(df$column_name)] <- 'fill_value'
# Setting values conditionally
df$column_name[df$column_name %in% c('old_value1', 'old_value2')] <- 'new_value'
# Conditional delete of rows
df <- df[df$column_name != 'some_value',]
# Create copy of column old_column in new column called new_column
df$new_column <- df$old_column
# Create copy of column old column in new column called new_column (i.e. deal with spaces)
df$new_column = df$`old column`
# Renaming a column
names(df)[names(df) == "old column name"] <- "newcolumnname"
# Renaming a column with dplyr
df <- df %>% rename("newcolumname" = "old column name")
