process_chats.py 4.26 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Filter out chats that are too short to be useful and compute a statistical
overview of the chat data set."""

import re
import pandas as pd


# read data
def read_chat_data(path):
    """Load the header-less chat CSV at ``path`` into a DataFrame.

    The file is read without a header row and its columns are labelled
    ``chat_id``, ``user_id``, ``message`` and ``timestamp`` (in that order).
    The first rows are printed as a quick sanity check before the frame is
    returned.
    """
    chat_df = pd.read_csv(
        path,
        header=None,
        names=["chat_id", "user_id", "message", "timestamp"],
    )
    # show a preview so the caller can eyeball the parsed columns
    print(chat_df.head())
    return chat_df


def filter_chats(df, threshold=3):
    """Keep only the chats that contain more than ``threshold`` messages.

    Parameters
    ----------
    df : pandas.DataFrame
        Chat data with at least the columns ``chat_id`` and ``message``.
    threshold : int, optional
        A chat is kept when its number of non-null messages is strictly
        greater than this value. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that belong to sufficiently long chats, in their
        original order and with their original index.
    """
    # Broadcast each chat's message count back onto its rows; this avoids
    # calling a Python lambda once per group.
    messages_per_row = df.groupby('chat_id')['message'].transform('count')
    return df[messages_per_row > threshold]


def get_summary(df):
    """Return the minimum, mean and maximum of ``df`` and print them.

    ``df`` may be a Series or a DataFrame; the aggregation runs along
    axis 0, and the result is indexed by ``min``, ``mean`` and ``max``.
    """
    statistics = ['min', 'mean', 'max']
    overview = df.agg(statistics, axis=0)
    print('statistic summary of data:', overview)
    return overview


def summarize_chats(df):
    """Summarize chat sizes and message lengths.

    Two quantities are computed and each is handed to ``get_summary``
    (min / mean / max): the number of messages per ``chat_id`` and the
    number of whitespace-separated tokens in every message.

    Returns the tuple ``(msg_lens, summary_chats, summary_msgs)`` where
    ``msg_lens`` is the per-message token count.
    """
    # messages per chat (``count`` only counts non-null messages)
    summary_chats = get_summary(df.groupby('chat_id')['message'].count())
    # words per message: split on whitespace, then take the list length
    msg_lens = df['message'].str.split().str.len()
    summary_msgs = get_summary(msg_lens)
    return msg_lens, summary_chats, summary_msgs


def get_n_count(df):
    """
    Collect the distinct user and chat ids together with their counts.

    Returns ``(uniq_users, uniq_chats, n_users, n_chats)``: the arrays of
    unique ``user_id`` / ``chat_id`` values followed by the number of
    distinct ids in each column. The counts are also printed.
    Note: There was one test case left in the chat data.
    """
    users = df['user_id']
    chats = df['chat_id']
    # counts come from ``nunique`` (which ignores missing ids), not from the
    # length of the arrays returned by ``unique``
    counts = (users.nunique(), chats.nunique())
    print('number of unique users: %i and number of unique chats: %i' % counts)
    return users.unique(), chats.unique(), counts[0], counts[1]


# Emoticon pattern, written as raw strings so every backslash reaches the
# regex engine unchanged. A match must be followed by whitespace, one of
# ``! . ?`` or the end of the string.
_EMOTICONS = re.compile(
    r'('
    r':\w+:'                            # shortcodes such as :smile:
    r'|<[/\\]?3'                        # hearts: <3, </3, <\3
    r'|[()\\D|*$][-^]?[:;=]'            # reversed emoticons such as (: or D:
    r'|[:;=B8][-^]?[3DOPp@$*\\)(/|]'    # forward emoticons such as :-) ;P 8)
    r')(?=\s|[!.?]|$)'
)
# Single special characters that are stripped after emoticon handling.
_SPECIAL_CHARS = re.compile(r'[$&+:;=|"@#<>^*()%/_-]')


def clean_messages(df):
    """Clean the ``message`` column from emoticons and other special tokens.

    Every emoticon is replaced by a full stop, afterwards the remaining
    special characters are removed. Cells that are not strings (for example
    missing messages) are left untouched.

    The ``message`` column of ``df`` is overwritten in place and the same
    DataFrame object is returned.
    """
    def _clean(text):
        # skip NaN / None and any other non-string cell instead of crashing
        if not isinstance(text, str):
            return text
        without_emoticons = _EMOTICONS.sub('.', text)
        return _SPECIAL_CHARS.sub('', without_emoticons)

    df['message'] = df['message'].apply(_clean)
    return df


def sort_chat_messages(df):
    """Sort messages by chat id and, within each chat, by timestamp.

    The ``timestamp`` strings are parsed with the format
    ``%Y-%m-%d %H:%M:%S.%f``. The input frame is not modified; a new frame
    with a parsed ``timestamp`` column and a fresh 0..n-1 index is returned.
    Rows with a missing ``chat_id`` are kept and sorted to the end.
    """
    # convert string to datetime object
    parsed = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S.%f")
    # One two-key sort orders by chat first and by time within each chat,
    # and keeps the ``chat_id`` column without relying on groupby.apply.
    ordered = df.assign(timestamp=parsed).sort_values(
        ['chat_id', 'timestamp'], ascending=True
    )
    return ordered.reset_index(drop=True)


def concat_and_save_message_strings(df, out_dir='/Users/Jana1/Desktop/MA/ttaspy/outputs/chats'):
    """Write one CSV per chat holding each user's concatenated messages.

    For every ``chat_id`` the messages of each ``user_id`` are joined with a
    single space, in the order in which they appear in ``df``, and written
    to ``<out_dir>/<chat_id>.csv`` with ``;`` as separator and the columns
    ``user_id`` and ``message``. Missing messages are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Chat data with the columns ``chat_id``, ``user_id`` and ``message``.
    out_dir : str, optional
        Directory that receives the files; it is created if it does not
        exist. Defaults to the location that used to be hard-coded.
    """
    import os

    os.makedirs(out_dir, exist_ok=True)
    for chat_id, frame in df.groupby('chat_id'):
        # groupby already orders the users and keeps each user's messages in
        # their original row order, so no extra sort is needed
        strings = (
            frame.groupby('user_id')['message']
            .apply(lambda msgs: ' '.join(msgs.dropna().astype(str)))
            .reset_index()
        )
        target = os.path.join(out_dir, '{}.csv'.format(chat_id))
        strings.to_csv(target, sep=';', index=False)


if __name__ == '__main__':
    # read data
    chat_data = read_chat_data('../outputs/ttas-filtered-chats.csv')
    # filter data
    filtered_chats = filter_chats(chat_data)
    print(filtered_chats.shape)
    # get n unique users and chats
    unique_users, unique_chats, number_users, number_chats = get_n_count(filtered_chats)
    # get n messages per chats, message lengths
    message_lens, chat_summary, summary_messages = summarize_chats(filtered_chats)
    # personality data (loaded here but not used by the steps in this script)
    trait_scores = pd.read_csv('../outputs/filtered_personality_scores.csv')

    # optional steps, currently disabled: drop the first column, clean and
    # sort the messages, then export the cleaned chats and per-chat strings
    # chat_data = chat_data.drop(columns=chat_data.columns[0], axis=1)
    # print(chat_data.head())
    # clean = clean_messages(filtered_chats)
    # sort = sort_chat_messages(clean)
    # sort.to_csv('../outputs/ttas_clean_chats.csv', sep=';', index=False)
    # concat_and_save_message_strings(chat_data)