import requests
import os
import pandas as pd
import re
import numpy as np

import matplotlib.pyplot as plt

# Download image-predictions.tsv into `folder_name` and load it into `df`.

folder_name = 'download'
# exist_ok replaces the separate exists() check and its check-then-create race.
os.makedirs(folder_name, exist_ok=True)

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the TSV.
response.raise_for_status()

# The local file name is the last URL segment; the same path is used for both
# writing and reading so the two cannot drift apart.
image_predictions_path = os.path.join(folder_name, url.split('/')[-1])
with open(image_predictions_path, mode='wb') as file:
    file.write(response.content)

df = pd.read_csv(image_predictions_path, sep='\t')

df

# Load twitter-archive-enhanced.csv (expected in the working directory) into df1.

archive_csv = 'twitter-archive-enhanced.csv'
df1 = pd.read_csv(archive_csv)
df1

#tweepy

import tweepy

# Credentials are taken from the environment when the variables are set, so
# real keys never need to be written into this file.  The original placeholder
# strings remain as defaults, so the variable names and fallback values are
# unchanged.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '-------------')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '---------------')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-----------------')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '------------')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

auth.set_access_token(access_token, access_secret)

# Let tweepy wait when the rate limit is hit rather than raising.
# NOTE(review): wait_on_rate_limit_notify looks specific to older tweepy
# releases -- confirm against the installed version.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

# Optional smoke test of the credentials:
# public_tweets = api.home_timeline()
# for tweet in public_tweets:
#     print(tweet.text)

8643
38970

# Query the API once per tweet_id in df1 and store each result in
# tweet_json.txt, one record per line.  Each record is str() of the payload
# object (result._json), not strict JSON text.  IDs the API rejects are
# reported with their row number and skipped; line_count is the number of
# records written.
line_count = 0

with open('tweet_json.txt', encoding='utf-8', mode = 'w') as file:
    for i, tweet_id in enumerate(df1['tweet_id']):
        try:
            result = api.get_status(str(tweet_id))
        except tweepy.TweepError as e:
            print(i, e.reason)
            continue

        print(i, tweet_id)
        file.write(str(result._json) + '\n')
        line_count += 1

print(line_count)

0 892420643555336193
1 892177421306343426
2 891815181378084864
3 891689557279858688
4 891327558926688256
5 891087950875897856
6 890971913173991426
7 890729181411237888
8 890609185150312448
9 890240255349198849
10 890006608113172480
11 889880896479866881
12 889665388333682689
13 889638837579907072
14 889531135344209921
15 889278841981685760
16 888917238123831296
17 888804989199671297
18 888554962724278272
19 [{'code': 144, 'message': 'No status found with that ID.'}]
20 888078434458587136
21 887705289381826560
22 887517139158093824
23 887473957103951883
24 887343217045368832
25 887101392804085760
....

#Synthesize df2 from tweet_json.txt

# Each line of tweet_json.txt is the str() of one tweet payload, so keys are
# single-quoted.  Each pattern captures the digits following the FIRST
# occurrence of its key on the line.
regex_id = r"'id': ([0-9]+)"
regex_retweet = r"'retweet_count': ([0-9]+)"
regex_fav = r"'favorite_count': ([0-9]+)"

r_id = re.compile(regex_id)
r_ret = re.compile(regex_retweet)
r_fav = re.compile(regex_fav)

df2_list = []
skipped_lines = 0

# Iterate over the lines actually present in the file.  The previous version
# looped a hard-coded 2345 times with readline(), which raised IndexError when
# the file was shorter and silently ignored any extra lines when it was longer.
with open('tweet_json.txt', encoding='utf-8') as file:
    for title in file:
        idre = r_id.search(title)
        favc = r_fav.search(title)
        retc = r_ret.search(title)

        # Blank or malformed lines lack one of the fields; count and skip them
        # rather than crashing on a missing match.
        if not (idre and favc and retc):
            skipped_lines += 1
            continue

        # Values stay strings here, exactly as before.
        df2_list.append({'tweet_id' : idre.group(1),
                        'retweet_count' : retc.group(1),
                        'favorite_count' : favc.group(1)})

if skipped_lines:
    print('skipped', skipped_lines, 'line(s) without id/retweet/favorite fields')

# Kept for backward compatibility with code that reads these names; both now
# reflect the number of records parsed instead of a hand-typed constant.
line_count = len(df2_list)
count = line_count

df2 = pd.DataFrame(df2_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])

df2

# Show rows whose tweet_id repeats an earlier row, for each of the three tables.
df[df.tweet_id.duplicated()]

df1[df1.tweet_id.duplicated()]

# df2 is built with a 'tweet_id' column and has no 'id' column, so the
# previous df2.id lookup raised AttributeError.
df2[df2.tweet_id.duplicated()]

# Count missing values in the key columns of each table.  The bare numbers
# between the expressions are the pasted cell outputs.
df.tweet_id.isnull().sum()

0

df1.tweet_id.isnull().sum()

0

df1.rating_numerator.isnull().sum()

0

# df2 has no 'id' column (it is created with 'tweet_id'), so check that one.
df2.tweet_id.isnull().sum()

0

# Distribution of rating denominators in the archive.
df1.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

# Distribution of rating numerators in the archive.
df1['rating_numerator'].value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
......
Name: rating_numerator, dtype: int64

# Visual assessment of the raw archive (df1).  Every expression below only
# displays a filtered view; nothing is modified.

# Ratings with a numerator of 20 or more.
df1[df1['rating_numerator'] >= 20]

# Ratings with a denominator below 10.
df1[df1['rating_denominator'] < 10]

# -1 asks pandas not to truncate long cell text.
# NOTE(review): newer pandas releases expect None instead of -1 -- confirm
# against the installed version.
pd.options.display.max_colwidth = -1

# Names that are entirely lowercase.  The explicit == True also turns a
# missing result (NaN) into False so the mask is purely boolean.
df1[(df1['name'].str.islower() == True)]

df1[df1.name.duplicated()]

df1[df1.name.isnull()]

# Tweets whose text starts with 'RT'.
df1[df1['text'].str.slice(start=0, stop=2) == 'RT']

# Rows with a retweeted_status_id set.
df1[df1.retweeted_status_id.notnull()]

# df1_clean is only created later in the file, so referencing it here raised
# NameError when running top to bottom; the assessment targets the raw df1.
df1['name'].describe()

count     2247
unique    932 
top       None
freq      745 
Name: name, dtype: object

# Do not truncate long cell text in the displayed frames.
pd.options.display.max_colwidth = -1

# Rows where characters 72 up to (but excluding) the last four of the source
# string are not 'iPhone'.
df1[df1['source'].str[72:-4] != 'iPhone']

# Summary of the tweet_id column gathered from the API.
df2['tweet_id'].describe()

count     2345              
unique    2345              
top       778650543019483137
freq      1                 
Name: tweet_id, dtype: object

# Work on copies so the raw frames stay available for comparison.
df_clean = df.copy()
df1_clean = df1.copy()
df2_clean = df2.copy()

# Keep only rows without a retweeted_status_id, i.e. drop retweets.
df1_clean = df1_clean.loc[df1_clean.retweeted_status_id.isnull()]

# Test: list any remaining tweets whose text still starts with 'RT'.
df1_clean[(df1_clean['text'].str.slice(start=0, stop=2) == 'RT')]

# Renumber rows 0..n-1 after the filter.  The old labels are kept in a new
# 'index' column, as before.
df1_clean = df1_clean.reset_index()


def _short_source(label):
    """Map an anchor label to the short client name used in the analysis."""
    if not isinstance(label, str):
        # No anchor text was found; leave the missing value untouched.
        return label
    if label.startswith('Twitter for '):
        return label[len('Twitter for '):]
    if label.startswith('Vine'):
        return 'Vine'
    return label


# `source` holds an HTML anchor; take the visible text between '>' and '</a>'.
# The earlier pattern was a non-raw string, so its '\b' was a backspace
# character and only the last word ("Client", "Scene", ...) was captured,
# which then had to be patched row by row.  Extracting the full label and
# shortening it yields the same values: iPhone, Vine, Twitter Web Client,
# TweetDeck.
df1_clean['source'] = df1_clean.source.str.extract(r'>([^<]+)</a>', expand=False)
df1_clean['source'] = df1_clean['source'].map(_short_source)

df1_clean.source.value_counts()

iPhone                2042
Vine                    91
Twitter Web Client      31
TweetDeck               11
Name: source, dtype: int64

# Normalise the casing of the three prediction columns: first character
# upper-case, the rest lower-case.
df_col = ['p1', 'p2', 'p3']

for column in df_col:
    df_clean[column] = df_clean[column].str.capitalize()

df_clean[df_col]

# Repair rows whose `name` is all lowercase.  The tweet text is searched for
# "named <Word>" or "is <Word>" (capitalised word):
#   * exactly one hit  -> that word becomes the name (and the hit is printed)
#   * no hit           -> the name is set to the string "None"
#   * several hits     -> the row is left unchanged
regex_name2 = 'named ([A-Z][a-z]+)|is ([A-Z][a-z]+)'
r_name2 = re.compile(regex_name2)

# Collect the affected row labels first; each row's name is only ever changed
# while that row is being processed, so this matches a per-row check.
lowercase_rows = [row for row in range(len(df1_clean))
                  if df1_clean.loc[row, 'name'].islower()]

for i in lowercase_rows:
    df1_name = r_name2.findall(df1_clean.loc[i, 'text'])

    if not df1_name:
        df1_clean.at[i, 'name'] = "None"
    elif len(df1_name) == 1:
        # findall yields (named_group, is_group); the alternative that did not
        # take part in the match is an empty string.
        after_named, after_is = df1_name[0]

        if after_named:
            df1_clean.at[i, 'name'] = after_named
            print(df1_name)

        if after_is:
            df1_clean.at[i, 'name'] = after_is
            print(df1_name)

[('', 'Freudian')]
[('', 'Arctic')]
[('', 'Zoey')]
[('', 'Quizno')]
[('', 'Jamaican')]
[('', 'Alaskan')]
[('', 'Bulgarian')]
[('', 'Dutch')]
[('', 'Mongolian')]
[('Wylie', '')]
[('Kip', '')]
[('Jacob', '')]
[('Rufus', '')]
[('Spork', '')]
[('Cherokee', '')]
[('Hemry', '')]
[('Alphred', '')]
[('Alfredo', '')]
[('Leroi', '')]
[('Berta', '')]
[('Chuk', '')]
[('Alfonso', '')]
[('Cheryl', '')]
[('Jessiga', '')]
[('Klint', '')]
[('Kohl', '')]
[('', 'Daryl')]
[('Pepe', '')]
[('Octaviath', '')]
[('Johm', '')]

# Spot-check two individual tweets by ID.
df1_clean.loc[df1_clean['tweet_id'].eq(666701168228331520)]

df1_clean.loc[df1_clean['tweet_id'].eq(748692773788876800)]

# Random sample before the replacement ...
df1_clean.sample(200)

# Turn every cell equal to the string "None" into a real missing value.
df1_clean = df1_clean.replace("None", np.nan)

# ... and a random sample afterwards, for comparison.
df1_clean.sample(200)

# Show the full tweet text for every rating whose denominator is not 10.
pd.options.display.max_colwidth = -1
odd_denominator = df1_clean['rating_denominator'] != 10
df1_clean.loc[odd_denominator, ['rating_numerator', 'rating_denominator', 'text']]

# Manual corrections keyed by row label: (numerator, denominator).
# A pair of (0, 0) marks a tweet whose text contains no rating at all.
rating_fixes = {
    263: (13, 10),
    287: (0, 0),
    429: (0, 0),
    890: (14, 10),
    987: (13, 10),
    1483: (10, 10),
    2154: (9, 10),
}

for row_label, (numerator, denominator) in rating_fixes.items():
    df1_clean.loc[row_label, 'rating_numerator'] = numerator
    df1_clean.loc[row_label, 'rating_denominator'] = denominator

# Remove the records that do not have a rating (denominator set to 0 above).
df1_clean = df1_clean.loc[df1_clean['rating_denominator'] != 0]

df1_clean.rating_denominator.value_counts()

10     2158
50     3   
80     2   
170    1   
150    1   
130    1   
120    1   
110    1   
90     1   
70     1   
40     1   
20     1   
16     1   
Name: rating_denominator, dtype: int64

# Remove the reply and retweet bookkeeping columns from the cleaned archive.
unused_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
df1_clean = df1_clean.drop(unused_columns, axis='columns')

df1_clean

# The values parsed from tweet_json.txt are strings; convert them to integers.
# int64 is requested explicitly: plain `int` maps to a 32-bit type on some
# platforms (Windows with NumPy < 2), which cannot hold 18-digit tweet IDs.
for column in ('tweet_id', 'retweet_count', 'favorite_count'):
    df2_clean[column] = df2_clean[column].astype('int64')

df2_clean['tweet_id'].describe()

count    2.345000e+03
mean     7.422940e+17
std      6.833642e+16
min      6.660209e+17
25%      6.783802e+17
50%      7.189392e+17
75%      7.986979e+17
max      8.924206e+17
Name: tweet_id, dtype: float64

# Join the three cleaned tables on tweet_id.  how='inner' (the default, now
# spelled out) keeps only tweets present in all of them.
df_master_clean = pd.merge(df_clean, df1_clean, on=['tweet_id'], how='inner')

df_master_clean = pd.merge(df_master_clean, df2_clean, on=['tweet_id'], how='inner')

df_master_clean

# index=False: the row index carries no information, and writing it made the
# reload below gain a junk 'Unnamed: 0' column.
df_master_clean.to_csv('twitter_archive_master.csv', index=False)

df_master = pd.read_csv('twitter_archive_master.csv')

# 'rating' is not created anywhere in the code visible here, so derive it when
# it is missing.
# NOTE(review): numerator / denominator is an assumption -- it matches the
# pasted outputs (e.g. 177.6, 42.0, 7.5); confirm this is the intended
# definition.
if 'rating' not in df_master.columns:
    df_master['rating'] = df_master['rating_numerator'] / df_master['rating_denominator']

#The breeds of dog in the top 20 favorited tweets.

df_top_fav = df_master.sort_values('favorite_count', ascending=False).head(20)[['p1', 'p1_dog', 'favorite_count']]

df_top_fav

#The breeds of dog in the top 20 retweeted tweets.

df_top_ret = df_master.sort_values('retweet_count', ascending=False).head(20)[['p1', 'p1_dog', 'retweet_count']]

df_top_ret

# Tweets rated 1.4 or higher, highest rating first.
df_top_rated = df_master.loc[df_master['rating'] >= 1.4, ['p1', 'p1_dog', 'rating']].sort_values('rating', ascending=False)

df_top_rated

# Stack the three selections.  pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; columns missing from a piece are filled with NaN.
df_top_dogs = pd.concat([df_top_fav, df_top_ret, df_top_rated])

# Keep only rows whose first prediction is flagged as a dog.
df_top_dogs = df_top_dogs[(df_top_dogs['p1_dog'] == True)]


df_top_dogs

# Horizontal bar chart of how often each p1 value appears in df_top_dogs,
# sorted so the most frequent value is drawn last.
breed_counts = df_top_dogs['p1'].value_counts().sort_values(ascending=True)
breed_counts.plot(kind='barh', figsize=(8, 8), rot=0)
plt.title('The most popular breed found')

Text(0.5,1,'The most popular breed found')

#This part is not in the report

# Number of tweets per source in the master table.
df_master['source'].value_counts()

iPhone                1954
Twitter Web Client      28
TweetDeck               11
Name: source, dtype: int64

# Pie chart of tweet sources, labelled with percentages to two decimals.
source_counts = df_master['source'].value_counts()
source_counts.plot(kind='pie', figsize=(8, 8), autopct='%.2f')
# Hide the axes frame, then force equal scaling on both axes.
plt.axis('off')
plt.axis('equal')
plt.title('Source pie chart')

Text(0.5,1,'Source pie chart')

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True
5	666050758794694657	https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg	1	Bernese_mountain_dog	0.651137	True	English_springer	0.263788	True	Greater_Swiss_Mountain_dog	0.016199	True
6	666051853826850816	https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg	1	box_turtle	0.933012	False	mud_turtle	0.045885	False	terrapin	0.017885	False
7	666055525042405380	https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg	1	chow	0.692517	True	Tibetan_mastiff	0.058279	True	fur_coat	0.054449	False
8	666057090499244032	https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg	1	shopping_cart	0.962465	False	shopping_basket	0.014594	False	golden_retriever	0.007959	True
9	666058600524156928	https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg	1	miniature_poodle	0.201493	True	komondor	0.192305	True	soft-coated_wheaten_terrier	0.082086	True
10	666063827256086533	https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg	1	golden_retriever	0.775930	True	Tibetan_mastiff	0.093718	True	Labrador_retriever	0.072427	True

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
0	892420643555336193	NaN	NaN	2017-08-01 16:23:56 +0000	<a href="http://twitter.com/download/iphone" r...	This is Phineas. He's a mystical boy. Only eve...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892420643...	13	10	Phineas	None	None	None	None
1	892177421306343426	NaN	NaN	2017-08-01 00:17:27 +0000	<a href="http://twitter.com/download/iphone" r...	This is Tilly. She's just checking pup on you....	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892177421...	13	10	Tilly	None	None	None	None
2	891815181378084864	NaN	NaN	2017-07-31 00:18:03 +0000	<a href="http://twitter.com/download/iphone" r...	This is Archie. He is a rare Norwegian Pouncin...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891815181...	12	10	Archie	None	None	None	None
3	891689557279858688	NaN	NaN	2017-07-30 15:58:51 +0000	<a href="http://twitter.com/download/iphone" r...	This is Darla. She commenced a snooze mid meal...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891689557...	13	10	Darla	None	None	None	None
4	891327558926688256	NaN	NaN	2017-07-29 16:00:24 +0000	<a href="http://twitter.com/download/iphone" r...	This is Franklin. He would like you to stop ca...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891327558...	12	10	Franklin	None	None	None	None
5	891087950875897856	NaN	NaN	2017-07-29 00:08:17 +0000	<a href="http://twitter.com/download/iphone" r...	Here we have a majestic great white breaching ...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891087950...	13	10	None	None	None	None	None

	tweet_id	retweet_count	favorite_count
0	892420643555336193	8641	38967
1	892177421306343426	6351	33354
2	891815181378084864	4213	25136
3	891689557279858688	8759	42299
4	891327558926688256	9521	40472
5	891087950875897856	3155	20293
6	890971913173991426	2104	11900
7	890729181411237888	19157	65817
8	890609185150312448	4321	27876
9	890240255349198849	7524	32075
10	890006608113172480	7439	30775

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
188	855862651834028034	8.558616e+17	1.943518e+08	2017-04-22 19:15:32 +0000	<a href="http://twitter.com/download/iphone" r...	@dhmontgomery We also gave snoop dogg a 420/10...	NaN	NaN	NaN	NaN	420	10	None	None	None	None	None
189	855860136149123072	8.558585e+17	1.361572e+07	2017-04-22 19:05:32 +0000	<a href="http://twitter.com/download/iphone" r...	@s8n You tried very hard to portray this good ...	NaN	NaN	NaN	NaN	666	10	None	None	None	None	None
290	838150277551247360	8.381455e+17	2.195506e+07	2017-03-04 22:12:52 +0000	<a href="http://twitter.com/download/iphone" r...	@markhoppus 182/10	NaN	NaN	NaN	NaN	182	10	None	None	None	None	None
313	835246439529840640	8.352460e+17	2.625958e+07	2017-02-24 21:54:03 +0000	<a href="http://twitter.com/download/iphone" r...	@jonnysun @Lin_Manuel ok jomny I know you're e...	NaN	NaN	NaN	NaN	960	0	None	None	None	None	None
340	832215909146226688	NaN	NaN	2017-02-16 13:11:49 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: This is Logan, the Chow who liv...	7.867091e+17	4.196984e+09	2016-10-13 23:23:56 +0000	https://twitter.com/dog_rates/status/786709082...	75	10	Logan	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
313	835246439529840640	8.352460e+17	26259576.0	2017-02-24 21:54:03 +0000	<a href="http://twitter.com/download/iphone" r...	@jonnysun @Lin_Manuel ok jomny I know you're e...	NaN	NaN	NaN	NaN	960	0	None	None	None	None	None
516	810984652412424192	NaN	NaN	2016-12-19 23:06:23 +0000	<a href="http://twitter.com/download/iphone" r...	Meet Sam. She smiles 24/7 & secretly aspir...	NaN	NaN	NaN	https://www.gofundme.com/sams-smile,https://tw...	24	7	Sam	None	None	None	None
2335	666287406224695296	NaN	NaN	2015-11-16 16:11:11 +0000	<a href="http://twitter.com/download/iphone" r...	This is an Albanian 3 1/2 legged Episcopalian...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666287406...	1	2	an	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
22	887517139158093824	NaN	NaN	2017-07-19 03:39:09 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	I've yet to rate a Venezuelan Hover Wiener. This is such an honor. 14/10 paw-inspiring af (IG: roxy.thedoxy) https://t.co/20VrLAA8ba	NaN	NaN	NaN	https://twitter.com/dog_rates/status/887517139158093824/video/1	14	10	such	None	None	None	None
56	881536004380872706	NaN	NaN	2017-07-02 15:32:16 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF	NaN	NaN	NaN	https://twitter.com/dog_rates/status/881536004380872706/video/1	14	10	a	None	None	pupper	None
118	869988702071779329	NaN	NaN	2017-05-31 18:47:24 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	RT @dog_rates: We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10…	8.591970e+17	4.196984e+09	2017-05-02 00:04:57 +0000	https://twitter.com/dog_rates/status/859196978902773760/video/1	12	10	quite	None	None	None	None
169	859196978902773760	NaN	NaN	2017-05-02 00:04:57 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 https://t.co/g2nSyGenG9	NaN	NaN	NaN	https://twitter.com/dog_rates/status/859196978902773760/video/1	12	10	quite	None	None	None	None
193	855459453768019968	NaN	NaN	2017-04-21 16:33:22 +0000	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective https://t.co/Xc7uj1C64x	NaN	NaN	NaN	https://twitter.com/dog_rates/status/855459453768019968/photo/1,https://twitter.com/dog_rates/status/855459453768019968/photo/1	12	10	quite	None	None	None	None

	rating_numerator	rating_denominator	text
342	11	15	@docmisterio account started on 11/15/15
433	84	70	The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd
784	9	11	RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/…
902	165	150	Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
1068	9	11	After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ
1120	204	170	Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
1165	4	20	Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a
1202	50	50	This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq
1228	99	90	Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1
1254	80	80	Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12
1274	45	50	From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK
1351	60	50	Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa
1433	44	40	Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ
1598	4	20	Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating
1634	143	130	Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3
1635	121	110	Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55
1662	7	11	This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5
1663	20	16	I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible
1779	144	120	IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
1843	88	80	Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
7	890729181411237888	NaN	NaN	2017-07-28 00:22:40 +0000	<a href="http://twitter.com/download/iphone" r...	When you watch your owner call another dog a g...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/890729181...	13	10	None	None	None	None	None
12	889665388333682689	NaN	NaN	2017-07-25 01:55:32 +0000	<a href="http://twitter.com/download/iphone" r...	Here's a puppo that seems to be on the fence a...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/889665388...	13	10	None	None	None	None	puppo
23	887473957103951883	NaN	NaN	2017-07-19 00:47:34 +0000	<a href="http://twitter.com/download/iphone" r...	This is Canela. She attempted some fancy porch...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/887473957...	13	10	Canela	None	None	None	None
24	887343217045368832	NaN	NaN	2017-07-18 16:08:03 +0000	<a href="http://twitter.com/download/iphone" r...	You may not have known you needed to see this ...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/887343217...	13	10	None	None	None	None	None
25	887101392804085760	NaN	NaN	2017-07-18 00:07:08 +0000	<a href="http://twitter.com/download/iphone" r...	This... is a Jubilant Antarctic House Bear. We...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/887101392...	12	10	None	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
19	888202515573088257	NaN	NaN	2017-07-21 01:02:36 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: This is Canela. She attempted s...	8.874740e+17	4.196984e+09	2017-07-19 00:47:34 +0000	https://twitter.com/dog_rates/status/887473957...	13	10	Canela	None	None	None	None
32	886054160059072513	NaN	NaN	2017-07-15 02:45:48 +0000	<a href="http://twitter.com/download/iphone" r...	RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...	8.860537e+17	1.960740e+07	2017-07-15 02:44:07 +0000	https://twitter.com/dog_rates/status/886053434...	12	10	None	None	None	None	None
36	885311592912609280	NaN	NaN	2017-07-13 01:35:06 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: This is Lilly. She just paralle...	8.305833e+17	4.196984e+09	2017-02-12 01:04:29 +0000	https://twitter.com/dog_rates/status/830583320...	13	10	Lilly	None	None	None	None
68	879130579576475649	NaN	NaN	2017-06-26 00:13:58 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: This is Emmy. She was adopted t...	8.780576e+17	4.196984e+09	2017-06-23 01:10:23 +0000	https://twitter.com/dog_rates/status/878057613...	14	10	Emmy	None	None	None	None
73	878404777348136964	NaN	NaN	2017-06-24 00:09:53 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: Meet Shadow. In an attempt to r...	8.782815e+17	4.196984e+09	2017-06-23 16:00:04 +0000	https://www.gofundme.com/3yd6y1c,https://twitt...	13	10	Shadow	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
209	852226086759018497	NaN	NaN	2017-04-12 18:25:07 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	Meet General. He wasn't content with the quality of his room. Requested to pupgrade, but was ignored. 14/10 look who just lost a customer https://t.co/NP5JW8LnmW	NaN	NaN	NaN	https://twitter.com/dog_rates/status/852226086759018497/video/1	14	10	General	None	None	None	None
270	841314665196081154	NaN	NaN	2017-03-13 15:47:01 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	This is Max. There's no way in h*ck you're taking his pacifier. Binky promises it's not happening. 13/10 very good stubborn boy https://t.co/9lVAqDEvZ5	NaN	NaN	NaN	https://twitter.com/dog_rates/status/841314665196081154/video/1	13	10	Max	None	None	None	None
335	832645525019123713	NaN	NaN	2017-02-17 17:38:57 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	There's going to be a dog terminal at JFK Airport. This is not a drill. 10/10 \nhttps://t.co/dp5h9bCwU7	NaN	NaN	NaN	http://us.blastingnews.com/news/2017/02/jfk-announces-its-first-ever-ark-oasis-animal-terminal-001480161.html?sbdht=_pM1QUzk3wsdTxcmMoRPV7FWYYlsNKcFRcYSY7OmeHnOXA4NtUM6PLQ2_	10	10	not	None	None	None	None
352	831315979191906304	NaN	NaN	2017-02-14 01:35:49 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	I couldn't make it to the #WKCDogShow BUT I have people there on the ground relaying me the finest pupper pics possible. 13/10 for all https://t.co/jd6lYhfdH4	NaN	NaN	NaN	https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1	13	10	None	None	None	pupper	None
375	828361771580813312	NaN	NaN	2017-02-05 21:56:51 +0000	<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>	Beebop and Doobert should start a band 12/10 would listen	NaN	NaN	NaN	NaN	12	10	None	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
1286	708400866336894977	NaN	NaN	2016-03-11 21:15:02 +0000	<a href="http://vine.co" rel="nofollow">Vine -...	RT if you are as ready for summer as this pup ...	NaN	NaN	NaN	https://vine.co/v/iHFqnjKVbIQ	12	10	None	None	None	None	None
1860	675489971617296384	NaN	NaN	2015-12-12 01:38:53 +0000	<a href="http://twitter.com/download/iphone" r...	RT until we find this dog. Clearly a cool dog ...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/675489971...	10	10	None	None	None	None	None

	index	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
1008	1186	718540630683709445	NaN	NaN	2016-04-08 20:46:50 +0000	iPhone	Get you a pup that can do both. 10/10 https://...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/718540630...	10	10	None	None	None	None	None
951	1129	729463711119904772	NaN	NaN	2016-05-09 00:11:16 +0000	iPhone	Meet Pupcasso. You can't afford his art. 13/10...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/729463711...	13	10	Pupcasso	None	None	None	None
1964	2143	669970042633789440	NaN	NaN	2015-11-26 20:04:40 +0000	iPhone	This is Julio. He was one of the original Ring...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/669970042...	10	10	Julio	None	None	None	None
803	978	749996283729883136	NaN	NaN	2016-07-04 16:00:22 +0000	TweetDeck	This is Bo. He emanates happiness. 12/10 I cou...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/749996283...	12	10	Bo	None	None	None	None
1494	1673	682303737705140231	NaN	NaN	2015-12-30 20:54:22 +0000	iPhone	This is Todo. He's screaming because he doesn'...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/682303737...	9	10	Todo	None	None	None	None

	index	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
494	614	796759840936919040	NaN	NaN	2016-11-10 17:02:03 +0000	iPhone	Say hello to Romeo. He was just told that it's...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/796759840...	11	10	Romeo	NaN	NaN	NaN	NaN
2151	2332	666345417576210432	NaN	NaN	2015-11-16 20:01:42 +0000	iPhone	Look at this jokester thinking seat belt laws ...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666345417...	10	10	NaN	NaN	NaN	NaN	NaN
1444	1623	684902183876321280	NaN	NaN	2016-01-07 00:59:40 +0000	iPhone	This is Perry. He's an Augustus Gloopster. Ver...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/684902183...	11	10	Perry	NaN	NaN	NaN	NaN
1524	1703	680913438424612864	NaN	NaN	2015-12-27 00:49:49 +0000	iPhone	Meet Griswold. He's dapper as hell. Already pu...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/680913438...	11	10	Griswold	NaN	NaN	NaN	NaN
1782	1961	673359818736984064	NaN	NaN	2015-12-06 04:34:25 +0000	iPhone	This is Steve. He was just relaxing in hot tub...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/673359818...	8	10	Steve	NaN	NaN	NaN	NaN

	p1	p1_dog	favorite_count
1683	Lakeland_terrier	True	143952
1218	Labrador_retriever	True	128764
1884	French_bulldog	True	124651
1593	Chihuahua	True	123459
1934	English_springer	True	106354
1659	Standard_poodle	True	94025
1857	Angora	False	92951
1899	Golden_retriever	True	83699
1591	Arabian_camel	False	82538
1927	Chesapeake_bay_retriever	True	80383
511	Bubble	False	79048
1959	Italian_greyhound	True	77876
1868	Chow	True	76730
1183	Eskimo_dog	True	73658
1727	Labrador_retriever	True	72236
1970	Pembroke	True	69403
1892	Laptop	False	66664
1985	Pomeranian	True	65817
569	Swing	False	61243
1720	Boxer	True	57279

	p1	p1_dog	rating
1267	Bow_tie	False	177.6
275	Microphone	False	42.0
1490	Pomeranian	True	7.5
1440	Clumber	True	2.7
619	Kuvasz	True	2.6

	p1	p2	p3
0	Welsh_springer_spaniel	Collie	Shetland_sheepdog
1	Redbone	Miniature_pinscher	Rhodesian_ridgeback
2	German_shepherd	Malinois	Bloodhound
3	Rhodesian_ridgeback	Redbone	Miniature_pinscher
4	Miniature_pinscher	Rottweiler	Doberman
5	Bernese_mountain_dog	English_springer	Greater_swiss_mountain_dog
6	Box_turtle	Mud_turtle	Terrapin
7	Chow	Tibetan_mastiff	Fur_coat
8	Shopping_cart	Shopping_basket	Golden_retriever
9	Miniature_poodle	Komondor	Soft-coated_wheaten_terrier
10	Golden_retriever	Tibetan_mastiff	Labrador_retriever

	favorite_count	p1	p1_dog	rating	retweet_count
1683	143952.0	Lakeland_terrier	True	NaN	NaN
1218	128764.0	Labrador_retriever	True	NaN	NaN
1884	124651.0	French_bulldog	True	NaN	NaN
1593	123459.0	Chihuahua	True	NaN	NaN
1934	106354.0	English_springer	True	NaN	NaN

Twitter Data Wrangling

Gathering data

Assessing data

Cleaning data

Analyzing and visualizing data

Source code:

Gathering data¶

Download the tweet image predictions¶

The WeRateDogs Twitter archive¶

Query Twitter API and read into dataframe¶

Assessing data¶

Quality¶

Tidiness¶

Cleaning data¶

Missing Data¶

image-predictions.tsv¶

twitter-archive-enhanced.csv¶

df2 (tweepy query result)¶

The missing data are not cleanable¶

Tidiness¶

Some of the records in twitter-archive-enhanced.csv are retweets¶

Define¶

Code¶

Test¶

Source in twitter-archive-enhanced.csv includes HTML code, url, and source.¶

Define¶

Code¶

Test¶

Quality¶

Mixed lower and upper case in p1/p2/p3 columns of image-predictions.tsv¶

Define¶

Code¶

Test¶

Inaccurate names in twitter-archive-enhanced.csv¶

Define¶

Code¶

Test¶

Missing values are represented by both "NaN" and "None".¶

Define¶

Code¶

Test¶

Incorrect values in rating (both rating_numerator and rating_denominator).¶

Define¶

Code¶

Test¶

The numbers in retweeted_status_id and retweeted_status_user_id are also not displayed correctly.¶

This has been solved by cleaning tidiness issue 1 (removing all the retweet data), but I would like to remove the retweeted-status and in-reply-to-status columns here to keep the master dataframe slim.¶

Define¶

Code¶

Test¶

Values in df2 are string not numbers¶

Define¶

Code¶

Test¶

Merging all three dataframes¶

Exporting master dataframe to CSV file¶

Analyzing and Visualizing Data¶

The breeds of dog ranked in the top 20 favorited tweets.¶

The breeds of dog ranked in the top 20 retweeted tweets.¶

The top rated dogs and their tweet info¶

Visualization¶

The most popular breeds of dog¶