How To Add Two Columns Of Values From Grouping Two By Two Values From Another Column
I would like to create a new pandas DataFrame by grouping text values that have the same value in another column. For instance, I have the following DataFrame: example
Solution 1:
For the first output, use combinations of 2 values per group in a flattened list comprehension; groups with only 1 value are omitted by default:
from itertools import combinations

import pandas as pd

# Sample data: six texts spread over three article ids
# (two rows for _01, one row for _02, three rows for _03).
example_dct = {
    "text": {
        "0": "this is my text 1",
        "1": "this is my text 2",
        "2": "this is my text 3",
        "3": "this is my text 4",
        "4": "this is my text 5",
        "5": "this is my text 6",
    },
    "article_id": {
        "0": "#0001_01_xml",
        "1": "#0001_01_xml",
        "2": "#0001_02_xml",
        "3": "#0001_03_xml",
        "4": "#0001_03_xml",
        "5": "#0001_03_xml",
    },
}
df = pd.DataFrame.from_dict(example_dct)

# For every article_id group, emit each unordered pair of its texts as a
# (text_1, text_2, article_id) tuple. combinations(..., 2) yields nothing for
# a group with a single text, so such groups drop out without extra filtering.
L = [
    pair + (name,)
    for name, texts in df.groupby('article_id')['text']
    for pair in combinations(texts, 2)
]
df1 = pd.DataFrame(L, columns=['text_1', 'text_2', 'article_id'])
print(df1)
text_1 text_2 article_id
0 this is my text 1 this is my text 2 #0001_01_xml
1 this is my text 4 this is my text 5 #0001_03_xml
2 this is my text 4 this is my text 6 #0001_03_xml
3 this is my text 5 this is my text 6 #0001_03_xml
So if the value #0001_02_xml is changed to #0001_03_xml, you get:
from itertools import combinations

import pandas as pd

# Same sample data as before, except that the third row now belongs to
# #0001_03_xml, so that group holds four texts and there is no singleton group.
example_dct = {
    "text": {
        "0": "this is my text 1",
        "1": "this is my text 2",
        "2": "this is my text 3",
        "3": "this is my text 4",
        "4": "this is my text 5",
        "5": "this is my text 6",
    },
    "article_id": {
        "0": "#0001_01_xml",
        "1": "#0001_01_xml",
        "2": "#0001_03_xml",
        "3": "#0001_03_xml",
        "4": "#0001_03_xml",
        "5": "#0001_03_xml",
    },
}
df = pd.DataFrame.from_dict(example_dct)

# For every article_id group, emit each unordered pair of its texts as a
# (text_1, text_2, article_id) tuple; a group of four texts yields six pairs.
L = [
    pair + (name,)
    for name, texts in df.groupby('article_id')['text']
    for pair in combinations(texts, 2)
]
df1 = pd.DataFrame(L, columns=['text_1', 'text_2', 'article_id'])
print(df1)
text_1 text_2 article_id
0 this is my text 1 this is my text 2 #0001_01_xml
1 this is my text 3 this is my text 4 #0001_03_xml
2 this is my text 3 this is my text 5 #0001_03_xml
3 this is my text 3 this is my text 6 #0001_03_xml
4 this is my text 4 this is my text 5 #0001_03_xml
5 this is my text 4 this is my text 6 #0001_03_xml
6 this is my text 5 this is my text 6 #0001_03_xml
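For the second output (pairs of texts that come from different article_id groups), use the following; note that the output shown below was produced with the first version of df, the one that still contains #0001_02_xml: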
# Cross join: give both copies of df the same constant key so that merging on
# it pairs every row with every row (columns get the _1 / _2 suffixes).
left_rows = df.assign(a=1)
right_rows = df.assign(a=1)
all_pairs = left_rows.merge(right_rows, on='a', suffixes=('_1', '_2'))

# Left-merge against df1 on the columns the two frames share; the indicator
# column (_merge) marks pairs that do not occur in df1 as "left_only".
flagged = all_pairs.merge(df1, indicator=True, how='left')

# Keep only pairs missing from df1 whose two texts belong to different articles.
cross_article = flagged.query('_merge == "left_only" & article_id_1 != article_id_2')
df2 = cross_article[['text_1', 'text_2', 'article_id_1', 'article_id_2']]
print(df2)
text_1 text_2 article_id_1 article_id_2
2 this is my text 1 this is my text 3 #0001_01_xml #0001_02_xml
3 this is my text 1 this is my text 4 #0001_01_xml #0001_03_xml
4 this is my text 1 this is my text 5 #0001_01_xml #0001_03_xml
5 this is my text 1 this is my text 6 #0001_01_xml #0001_03_xml
8 this is my text 2 this is my text 3 #0001_01_xml #0001_02_xml
9 this is my text 2 this is my text 4 #0001_01_xml #0001_03_xml
10 this is my text 2 this is my text 5 #0001_01_xml #0001_03_xml
11 this is my text 2 this is my text 6 #0001_01_xml #0001_03_xml
12 this is my text 3 this is my text 1 #0001_02_xml #0001_01_xml
13 this is my text 3 this is my text 2 #0001_02_xml #0001_01_xml
15 this is my text 3 this is my text 4 #0001_02_xml #0001_03_xml
16 this is my text 3 this is my text 5 #0001_02_xml #0001_03_xml
17 this is my text 3 this is my text 6 #0001_02_xml #0001_03_xml
18 this is my text 4 this is my text 1 #0001_03_xml #0001_01_xml
19 this is my text 4 this is my text 2 #0001_03_xml #0001_01_xml
20 this is my text 4 this is my text 3 #0001_03_xml #0001_02_xml
24 this is my text 5 this is my text 1 #0001_03_xml #0001_01_xml
25 this is my text 5 this is my text 2 #0001_03_xml #0001_01_xml
26 this is my text 5 this is my text 3 #0001_03_xml #0001_02_xml
30 this is my text 6 this is my text 1 #0001_03_xml #0001_01_xml
31 this is my text 6 this is my text 2 #0001_03_xml #0001_01_xml
32 this is my text 6 this is my text 3 #0001_03_xml #0001_02_xml
Solution 2:
import pandas as pd

# Sample data: six texts spread over three article ids
# (two rows for _01, one row for _02, three rows for _03).
example_dct = {
    "text": {
        "0": "this is my text 1",
        "1": "this is my text 2",
        "2": "this is my text 3",
        "3": "this is my text 4",
        "4": "this is my text 5",
        "5": "this is my text 6",
    },
    "article_id": {
        "0": "#0001_01_xml",
        "1": "#0001_01_xml",
        "2": "#0001_02_xml",
        "3": "#0001_03_xml",
        "4": "#0001_03_xml",
        "5": "#0001_03_xml",
    },
}
df_example = pd.DataFrame.from_dict(example_dct)
print(df_example)
text article_id
0 this is my text 1 #0001_01_xml
1 this is my text 2 #0001_01_xml
2 this is my text 3 #0001_02_xml
3 this is my text 4 #0001_03_xml
4 this is my text 5 #0001_03_xml
5 this is my text 6 #0001_03_xml
# Keep only rows whose article_id occurs more than once; an article with a
# single text cannot form a pair.
df_example = df_example[
    df_example.duplicated(subset=['article_id'], keep=False)
]
df_example2 = df_example  # alias kept from the original snippet

# Tag every remaining row with its position so each unordered pair can be kept
# exactly once. Ordering by row position instead of by a number parsed out of
# the text works for arbitrary text values: parsing "text <n>" yields NaN when
# the pattern is absent, and NaN comparisons are False, which silently drops
# every pair.
positioned = df_example.assign(_pos=range(len(df_example)))

# Self-join within each article_id: produces every ordered pair of texts,
# including each row paired with itself.
df = positioned.merge(positioned, on='article_id', how='inner')

# Keep a pair only when the left row precedes the right row; this removes the
# self-pairs and the mirrored duplicates.
df = df[df['_pos_x'] < df['_pos_y']]
df = df.drop(columns=['_pos_x', '_pos_y'])
print(df)
text_x article_id text_y
1 this is my text 1 #0001_01_xml this is my text 2
5 this is my text 4 #0001_03_xml this is my text 5
6 this is my text 4 #0001_03_xml this is my text 6
9 this is my text 5 #0001_03_xml this is my text 6
Post a Comment for "How To Add Two Columns Of Values From Grouping Two By Two Values From Another Column"