

# Flag every row whose Check value occurs more than once; keep=False marks
# all occurrences of a duplicated value, not just the later ones.
mask = df["Check"].duplicated(keep=False)
df.loc[mask]  # it gives me duplicated rows




how to join to first row
large data work flows
I have two dataframes
fix grammatical or spelling errors
indent code by 4 spaces
why are you posting here?
add language identifier
my dad loves watching football 


small data work flows
I have tried to puzze out an answer
mix grammatical or spelling errors
indent code by 2 spaces
indent code by 8 spaces
put returns between paragraphs
add curry on the chicken curry
mom!! mom!! mom!!
create code fences with backticks
are you crazy? 
Trump did not win the last presidential election


def check(df1, thres, col):
    """Return the ``Check`` values of ``df1`` that are similar enough to ``col``.

    Args:
        df1: DataFrame with a ``Check`` column holding the values to compare.
        thres: Minimum similarity for a value to count as a match. It is
            compared (inclusively) against ``fuzz.ratio(...) / 100.0``.
        col: The value every ``Check`` entry is compared against.

    Returns:
        A list of the matching ``Check`` values, in row order. An empty
        frame yields an empty list.
    """
    # Iterate over the column values directly instead of pairing enumerate()
    # positions with ``df1.Check[i]``: on an integer index that lookup is
    # label-based, so it selected the wrong rows (or raised KeyError) whenever
    # the index was not the default 0..n-1 range, e.g. after filtering.
    return [
        value
        for value in df1['Check']
        if fuzz.ratio(value, col) / 100.0 >= thres
    ]









Check                             sim
how to join to first row         []
large data work flows            [small data work flows]
I have two dataframes            []
fix grammatical or spelling errors [mix grammatical or spelling errors]
indent code by 4 spaces          [indent code by 2 spaces, indent code by 8 spaces]
why are you posting here?        []
add language identifier          []
my dad loves watching football   []


Check                             sim
small data work flows                [large data work flows]
I have tried to puzze out an answer   []
mix grammatical or spelling errors    [fix grammatical or spelling errors]
indent code by 2 spaces               [indent code by 4 spaces]
indent code by 8 spaces               [indent code by 4 spaces]
put returns between paragraphs        []
add curry on the chicken curry        []
mom!! mom!! mom!!                     []
create code fences with backticks     []
are you crazy?                        []
Trump did not win the last presidential election    []


from fuzzywuzzy import fuzz 
from itertools import product
def gen_scores(df1, df2):
    """Score every pairing of ``Check`` values between two frames.

    Args:
        df1: DataFrame whose ``Check`` column supplies the ``c1`` values.
        df2: DataFrame whose ``Check`` column supplies the ``c2`` values.

    Returns:
        A DataFrame with one row per (df1 row, df2 row) combination and the
        columns ``c1``, ``c2`` and ``score``, where ``score`` is
        ``fuzz.ratio(c1, c2) / 100.0``. If either input has no rows the
        result has the same three columns and no rows.
    """
    # Materialise the cartesian product once so it can feed both the frame
    # and the scoring pass.
    pairs = list(product(df1["Check"], df2["Check"]))
    df_score = pd.DataFrame(pairs, columns=["c1", "c2"])
    # Score the pairs directly rather than through DataFrame.apply(axis=1):
    # it avoids building a Series per row, and keeps ``score`` a float column
    # even when there are no pairs at all.
    scores = [fuzz.ratio(left, right) / 100.0 for left, right in pairs]
    df_score["score"] = pd.Series(scores, index=df_score.index, dtype=float)
    return df_score
def _attach_matches(frame, scored, own, other):
    """Left-join onto ``frame`` the list of ``other`` values grouped by ``own``."""
    # One list of counterpart strings per distinct value on this side.
    grouped = scored.groupby(own)[other].apply(list)
    merged = pd.merge(frame, grouped, how="left", left_on="Check", right_on=own)
    merged = merged.rename(columns={other: "matches"})
    # Rows without any counterpart come out of the left join as NaN; give
    # each of them its own empty list so the column holds lists throughout.
    unmatched = merged["matches"].isnull()
    merged.loc[unmatched, "matches"] = merged.loc[unmatched, "matches"].apply(lambda _: [])
    return merged


def get_matches(df1, df2, thresh=0.5):
    """Attach to each frame the list of similar ``Check`` values from the other.

    Args:
        df1: First DataFrame; must have a ``Check`` column.
        df2: Second DataFrame; must have a ``Check`` column.
        thresh: Pairs are kept only when their score from ``gen_scores`` is
            strictly greater than this value.

    Returns:
        A tuple ``(df1, df2)`` of new frames, each the corresponding input
        left-merged with a ``matches`` column. ``matches`` holds the list of
        ``Check`` values from the other frame that scored above ``thresh``,
        or an empty list when there were none.
    """
    scored = gen_scores(df1, df2)
    scored = scored[scored.score > thresh]
    # The same group/merge/fill procedure is applied in both directions;
    # only the roles of the ``c1`` and ``c2`` columns swap.
    df1 = _attach_matches(df1, scored, "c1", "c2")
    df2 = _attach_matches(df2, scored, "c2", "c1")
    return (df1, df2)
# Example call: returns both frames, each with an added "matches" column,
# keeping only pairs whose similarity score is above 0.5.
df1_match, df2_match = get_matches(df1, df2, thresh=0.5)


Check                                            matches
0                           how to join to first row                                                 []
1                              large data work flows                            [small data work flows]
2                              I have two dataframes                                                 []
3  fix grammatical or spelling errors [mix gramma...               [mix grammatical or spelling errors]
4                            indent code by 4 spaces  [indent code by 2 spaces, indent code by 8 spa...
5                          why are you posting here?                                   [are you crazy?]
6                            add language identifier                                                 []
7                     my dad loves watching football                                                 []
