Skip to content Skip to sidebar Skip to footer

How To Add Two Columns Of Values From Grouping Two By Two Values From Another Column

I would like to create a new pandas DataFrame by grouping text values that share the same value in another column. For instance, given the following DataFrame: example

Solution 1:

For the first part, use `combinations` of 2 values per group in a flattened list comprehension; groups with only 1 value are omitted automatically, because they produce no pairs:

from itertools import combinations

import pandas as pd

# Sample data: six texts spread over three articles; "#0001_02_xml" owns a
# single text only.
example_dct = {
  "text": {
    "0": "this is my text 1",
    "1": "this is my text 2",
    "2": "this is my text 3",
    "3": "this is my text 4",
    "4": "this is my text 5",
    "5": "this is my text 6",
  },
  "article_id": {
    "0": "#0001_01_xml",
    "1": "#0001_01_xml",
    "2": "#0001_02_xml",
    "3": "#0001_03_xml",
    "4": "#0001_03_xml",
    "5": "#0001_03_xml",
  }
}

df = pd.DataFrame.from_dict(example_dct)

# For every article, emit each unordered pair of its texts followed by the
# article id. combinations() yields nothing for a group with a single text,
# so such articles drop out without an explicit filter.
L = [
    pair + (article_id,)
    for article_id, texts in df.groupby('article_id')['text']
    for pair in combinations(texts, 2)
]
df1 = pd.DataFrame(L, columns=['text_1', 'text_2', 'article_id'])
print(df1)
              text_1             text_2    article_id
0  this is my text 1  this is my text 2  #0001_01_xml
1  this is my text 4  this is my text 5  #0001_03_xml
2  this is my text 4  this is my text 6  #0001_03_xml
3  this is my text 5  this is my text 6  #0001_03_xml

So if the value #0001_02_xml is changed to #0001_03_xml, you get:

from itertools import combinations

import pandas as pd

# Same sample data as before, except text 3 now belongs to "#0001_03_xml",
# so that article holds four texts and no article is left with one text.
example_dct = {
  "text": {
    "0": "this is my text 1",
    "1": "this is my text 2",
    "2": "this is my text 3",
    "3": "this is my text 4",
    "4": "this is my text 5",
    "5": "this is my text 6",
  },
  "article_id": {
    "0": "#0001_01_xml",
    "1": "#0001_01_xml",
    "2": "#0001_03_xml",
    "3": "#0001_03_xml",
    "4": "#0001_03_xml",
    "5": "#0001_03_xml",
  }
}

df = pd.DataFrame.from_dict(example_dct)

# One row per unordered pair of texts inside the same article; the article
# id is appended as the third field of each row.
L = [
    pair + (article_id,)
    for article_id, texts in df.groupby('article_id')['text']
    for pair in combinations(texts, 2)
]
df1 = pd.DataFrame(L, columns=['text_1', 'text_2', 'article_id'])
print(df1)
              text_1             text_2    article_id
0  this is my text 1  this is my text 2  #0001_01_xml
1  this is my text 3  this is my text 4  #0001_03_xml
2  this is my text 3  this is my text 5  #0001_03_xml
3  this is my text 3  this is my text 6  #0001_03_xml
4  this is my text 4  this is my text 5  #0001_03_xml
5  this is my text 4  this is my text 6  #0001_03_xml
6  this is my text 5  this is my text 6  #0001_03_xml

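For the second part (pairs of texts that come from different articles), use the original `df` and `df1` from the first example: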

# Cross join df with itself: a constant helper column 'a' gives every row
# the same key, so merging on it pairs each row with every row.
keyed = df.assign(a=1)
all_pairs = keyed.merge(keyed, on='a', suffixes=('_1', '_2'))

# Left-merge against the within-article pairs in df1 (on the columns the two
# frames share); indicator=True adds a '_merge' column telling whether a
# pair was also found in df1.
flagged = all_pairs.merge(df1, indicator=True, how='left')

# Keep pairs that are absent from df1 and whose two texts belong to
# different articles, then reduce to the four columns of interest.
cross_article = flagged.query('_merge == "left_only" &  article_id_1 != article_id_2')
df2 = cross_article[['text_1', 'text_2', 'article_id_1', 'article_id_2']]
print(df2)
               text_1             text_2  article_id_1  article_id_2
2   this is my text 1  this is my text 3  #0001_01_xml  #0001_02_xml
3   this is my text 1  this is my text 4  #0001_01_xml  #0001_03_xml
4   this is my text 1  this is my text 5  #0001_01_xml  #0001_03_xml
5   this is my text 1  this is my text 6  #0001_01_xml  #0001_03_xml
8   this is my text 2  this is my text 3  #0001_01_xml  #0001_02_xml
9   this is my text 2  this is my text 4  #0001_01_xml  #0001_03_xml
10  this is my text 2  this is my text 5  #0001_01_xml  #0001_03_xml
11  this is my text 2  this is my text 6  #0001_01_xml  #0001_03_xml
12  this is my text 3  this is my text 1  #0001_02_xml  #0001_01_xml
13  this is my text 3  this is my text 2  #0001_02_xml  #0001_01_xml
15  this is my text 3  this is my text 4  #0001_02_xml  #0001_03_xml
16  this is my text 3  this is my text 5  #0001_02_xml  #0001_03_xml
17  this is my text 3  this is my text 6  #0001_02_xml  #0001_03_xml
18  this is my text 4  this is my text 1  #0001_03_xml  #0001_01_xml
19  this is my text 4  this is my text 2  #0001_03_xml  #0001_01_xml
20  this is my text 4  this is my text 3  #0001_03_xml  #0001_02_xml
24  this is my text 5  this is my text 1  #0001_03_xml  #0001_01_xml
25  this is my text 5  this is my text 2  #0001_03_xml  #0001_01_xml
26  this is my text 5  this is my text 3  #0001_03_xml  #0001_02_xml
30  this is my text 6  this is my text 1  #0001_03_xml  #0001_01_xml
31  this is my text 6  this is my text 2  #0001_03_xml  #0001_01_xml
32  this is my text 6  this is my text 3  #0001_03_xml  #0001_02_xml

Solution 2:

import pandas as pd

# Sample data: six texts spread over three articles; "#0001_02_xml" owns a
# single text only.
example_dct = {
  "text": {
    "0": "this is my text 1",
    "1": "this is my text 2",
    "2": "this is my text 3",
    "3": "this is my text 4",
    "4": "this is my text 5",
    "5": "this is my text 6",
  },
  "article_id": {
    "0": "#0001_01_xml",
    "1": "#0001_01_xml",
    "2": "#0001_02_xml",
    "3": "#0001_03_xml",
    "4": "#0001_03_xml",
    "5": "#0001_03_xml",
  }
}
# Outer dict keys become the columns, inner dict keys become the row index.
df_example = pd.DataFrame.from_dict(example_dct)
print(df_example)
                text    article_id
0  this is my text 1  #0001_01_xml
1  this is my text 2  #0001_01_xml
2  this is my text 3  #0001_02_xml
3  this is my text 4  #0001_03_xml
4  this is my text 5  #0001_03_xml
5  this is my text 6  #0001_03_xml

# Keep only rows whose article_id occurs more than once; keep=False marks
# every member of a duplicated group, not just the repeats.
shares_article = df_example.duplicated(subset=['article_id'], keep=False)
df_example = df_example[shares_article]
df_example2 = df_example

# Self-join on article_id: every text is paired with every text of the same
# article, producing the columns text_x / text_y.
df = df_example.merge(df_example2, on='article_id', how='inner')

# Pull the number that follows "text " out of each side, as floats, so the
# pairs can be ordered.
df['no_x'] = df['text_x'].str.extract(r'text (\d+)').astype(float)
df['no_y'] = df['text_y'].str.extract(r'text (\d+)').astype(float)

# Keeping only "left number < right number" removes self-pairs and the
# mirrored duplicate of each pair; the helper columns are dropped afterwards.
ordered = df[df['no_x'] < df['no_y']]
df = ordered.drop(columns=['no_x', 'no_y'])

print(df)

              text_x    article_id             text_y
1  this is my text 1  #0001_01_xml  this is my text 2
5  this is my text 4  #0001_03_xml  this is my text 5
6  this is my text 4  #0001_03_xml  this is my text 6
9  this is my text 5  #0001_03_xml  this is my text 6

Post a Comment for "How To Add Two Columns Of Values From Grouping Two By Two Values From Another Column"