个人兴趣项目:聊天记录数据分析

/ 技术文章 / 0 条评论 / 2910浏览

ReadMe

这是一个个人兴趣项目,用于分析和朋友之间的聊天记录,应该不涉及隐私吧,好像没聊啥不能看的东西... 顺便定期复习一下Python和Pandas库的用法

需求

从微信聊天记录中分析以下信息:词频统计(词云)与聊天时间分布(每日趋势、星期分布、时段分布)。

数据准备

因为我的手机是 iPhone,因此只能通过 iTunes 备份数据库,再用其他工具(比如 PP 手机助手)导出备份数据(在 AppDomain-com.tencent.xin:/ 文件夹下);找到 Documents\f09a6328ee5d561a2e4abcb1e9836337\fts\fts_message.db,然后用 sqlcipher.exe 打开备份数据库,可以看到所有聊天记录都被存在 fts_message_table_N_content 表里(N 是 0-9)。

找到数据就好办了,通过SQL语句找到想要分析的记录,然后导出,下面是我的查询语句,渣SQL,但能用;找到记录并插入临时表中,之后使用工具的导出功能,导出为csv文件

-- Scratch table mirroring the schema of the fts_message_table_N_content shards.
CREATE TABLE 'fts_message_lucia_content'(docid INTEGER PRIMARY KEY, 'c0usernameid', 'c1MesLocalID', 'c2CreateTime', 'c3Message', 'c4reservedInt', 'c5reservedText', 'c6primary');

-- Collect one contact's messages (usernameid = 194) from all ten shard tables.
-- The filter is applied inside each SELECT rather than once around the whole
-- UNION, so the engine never materializes every message only to discard most
-- of them; UNION still deduplicates identical rows across shards.
-- (SQLite identifiers are case-insensitive, so the original mixed-case
-- c0Usernameid worked, but the column is spelled c0usernameid consistently here.)
insert into fts_message_lucia_content
select * from fts_message_table_0_content where c0usernameid=194
union
select * from fts_message_table_1_content where c0usernameid=194
union
select * from fts_message_table_2_content where c0usernameid=194
union
select * from fts_message_table_3_content where c0usernameid=194
union
select * from fts_message_table_4_content where c0usernameid=194
union
select * from fts_message_table_5_content where c0usernameid=194
union
select * from fts_message_table_6_content where c0usernameid=194
union
select * from fts_message_table_7_content where c0usernameid=194
union
select * from fts_message_table_8_content where c0usernameid=194
union
select * from fts_message_table_9_content where c0usernameid=194

出来的文件大概是这样的

"docid","c0usernameid","c1MesLocalID","c2CreateTime","c3Message","c4reservedInt","c5reservedText","c6primary"
"18357","194","1","1550239275","我通过了你的朋友验证请求,现在我们可以开始聊天了","0","",""
"18358","194","2","1550239473","Hi","0","",""
"18359","194","5","1550239547","xxxxxxxxxxxx?","0","",""
"18360","194","6","1550239586","xxxxxxxxxxxxxxxxx??","0","",""
"18361","194","7","1550239617","xxxxxxxxxxxxxxxxxx","0","",""
"18362","194","9","1550239743","听说你是it工程师?","0","",""
"18363","194","10","1550239759","码农,码农","0","",""
...

数据分析

词频统计

import matplotlib.pyplot as plt
from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from os import path
import pandas
import jieba
import jieba.analyse


# 构造词云生成类
class Cloud:
  """Build and render a word cloud from a blob of chat text.

  jieba extracts the top 200 keywords from *content*; WordCloud then draws
  them masked by the image at *img_filepath*, using the font file at
  *font_filepath* (required so CJK glyphs render — WordCloud's bundled
  DroidSansMono font has no Chinese coverage).
  """

  def __init__(self, content, img_filepath, font_filepath):
    # Chat filler words to drop, on top of the default English stop words.
    myStopwords = set(STOPWORDS)
    myStopwords.update({"什么", "这个", "那个", "嗯", "恩"})
    # Top 200 keywords by TF-IDF weight; weights themselves are not needed.
    tags = jieba.analyse.extract_tags(content, topK=200, withWeight=False)
    # BUG FIX: was path.dirname(__name__) — __name__ is the module's *name*
    # ("__main__"), not a filesystem path, so save_wc silently wrote to the
    # CWD. __file__ anchors output next to this script.
    self.d = path.dirname(__file__)
    self.text = " ".join(tags)
    # BUG FIX: scipy.misc.imread was removed in SciPy 1.2; matplotlib's
    # imread is a drop-in replacement for loading the mask image.
    self.img = plt.imread(img_filepath)

    self.wc = WordCloud(background_color='white',
                        max_words=2000,       # max words shown in the cloud
                        max_font_size=200,
                        mask=self.img,        # shape the cloud with the image
                        min_font_size=10,
                        stopwords=myStopwords,
                        mode='RGBA',
                        font_path=font_filepath,  # CJK-capable font
                        random_state=40)      # fixed seed -> stable colors
    self.wc.generate(self.text)

  def show_wc(self):
    """Display the cloud, recolored from the mask image's palette."""
    img_color = ImageColorGenerator(self.img)
    plt.figure(figsize=(15, 15))
    plt.imshow(self.wc.recolor(color_func=img_color))
    plt.axis('off')
    plt.show()

  def save_wc(self, out_file_name):
    """Save the rendered cloud as an image next to this script."""
    self.wc.to_file(path.join(self.d, out_file_name))
    
# Read the exported chat CSV and merge every message into one string.
def readCsv(path):
    """Return all values of the 'c3Message' column joined by single spaces.

    The result starts and ends with one space, matching the original
    accumulator format (" msg1 msg2 ... msgN ").
    """
    dataFrame = pandas.read_csv(path, encoding="utf-8")
    messages = dataFrame['c3Message']
    # BUG FIX: Series.iteritems() was removed in pandas 2.0 — iterate the
    # Series directly. join is O(n) vs. the old quadratic `+=` loop.
    return " " + "".join(str(content) + " " for content in messages)

if __name__ == '__main__':
  # Concatenate all chat messages into one space-separated blob.
  content = readCsv('msg_content_lucia.csv')
  # BUG FIX: use a context manager and explicit UTF-8 so the dump file is
  # always closed and Chinese text survives on platforms whose default
  # encoding is not UTF-8 (e.g. GBK on Chinese Windows).
  with open("text.txt", "w", encoding="utf-8") as fo:
    fo.write(content)
  # Feed the cloud the text, a mask image, and a CJK font so Chinese
  # characters are not rendered as boxes.
  wc = Cloud(content, 'background.jpg', 'Muyao-Softbrush.ttf')
  wc.show_wc()
  wc.save_wc('world_cloud3.png')

词云结果

聊天时间分析

处理数据

import time
import math
import pandas
from collections import OrderedDict

# Epoch timestamp anchoring day 0 of the chat history.
# NOTE(review): 1550160000 looks like midnight local (UTC+8) of the first
# chat day — confirm against the earliest c2CreateTime.
first_date = 1550160000

def to_hour(t):
    """Fractional hour-of-day (local time) for epoch timestamp *t*, 2 d.p."""
    local = time.localtime(t)
    return round(local.tm_hour + local.tm_min / 60, 2)

def to_weekday(t):
    """Day of week for epoch timestamp *t*: Monday=0 .. Sunday=6."""
    return pandas.Timestamp(t, unit='s').weekday()

def to_date(t):
    """Number of days (rounded up) elapsed since ``first_date``."""
    elapsed = float(t) - float(first_date)
    return math.ceil(elapsed / 86400.0)

# Load the exported chat log and derive per-message time features.
csv_path = 'msg_content_lucia.csv'
dataFrame = pandas.read_csv(csv_path, encoding="utf-8")
# Calendar-date string (%D = mm/dd/yy, local time) used for per-day grouping.
dataFrame['Date'] = dataFrame['c2CreateTime'].apply(
    lambda ts: time.strftime("%D", time.localtime(ts)))

chat_time = dataFrame['c2CreateTime']
chat_time.head()

# Message count per calendar day.
df_date_count = dataFrame.groupby(['Date']).size().reset_index(name='count')
date_set = [to_date(ts) for ts in chat_time]
hour_set = [to_hour(ts) for ts in chat_time]
weekday_set = [to_weekday(ts) for ts in chat_time]

df_date_count.head()

画图

import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import *#如果想在图上显示中文,需导入这个包

myfont = FontProperties(fname=r'Muyao-Softbrush.ttf', size=22)  # CJK-capable title font
sns.set_style('darkgrid')  # dark background with grid lines


def _finish_figure(title, out_file, xlabel=None):
    """Apply the shared styling, save the current figure, and show it."""
    plt.yticks(fontsize=15)
    plt.title(title, fontproperties=myfont)
    if xlabel is not None:
        plt.xlabel(xlabel, fontproperties=myfont)
    plt.ylabel('聊天分布', fontproperties=myfont)
    fig = plt.gcf()
    fig.set_size_inches(15, 8)
    fig.savefig(out_file, dpi=100)
    plt.show()


#    - Daily chat-volume trend since we met (density over day index).
sns.kdeplot(date_set, shade=True, color='lightcoral')
_finish_figure('每日聊天趋势', 'chat_date.png')

#    - Average chats per day-of-week.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# migrate to sns.histplot/displot when upgrading seaborn.
sns.distplot(weekday_set, 7, color='lightcoral')
plt.xticks(np.arange(0, 7, 1.0),
           ['周一', '周二', '周三', '周四', '周五', '周六', '周日'],
           fontsize=15, fontproperties=myfont)
_finish_figure('一周聊天日期分布', 'chat_weekday.png')

#    - Average chats per hour-of-day bucket.
sns.distplot(hour_set, 24, color='lightcoral')
plt.xticks(np.arange(0, 25, 1.0), fontsize=15)
_finish_figure('聊天时间分布', 'chat_time.png', xlabel='时间段')

聊天趋势 因为时间不是很长,好像也看不出什么,等以后时间长了再看看吧

每周日分布 周末明明见面的呀,为什么还会聊得这么多嘞?

小时分布 很明显我们的共同休息时间就只有中午和晚上睡前...