import requests
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
Gathering data¶
Download the tweet image predictions¶
# Download the image-predictions TSV into a local folder and load it.
folder_name = 'download'
# exist_ok=True creates the folder only when needed, without a separate
# existence check.
os.makedirs(folder_name, exist_ok=True)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of saving an error page as the TSV.
response.raise_for_status()
# The local file name is the last path component of the URL.
file_path = os.path.join(folder_name, url.split('/')[-1])
with open(file_path, mode='wb') as file:
    file.write(response.content)
# Read back exactly the file that was just written.
df = pd.read_csv(file_path, sep='\t')
df
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
5 | 666050758794694657 | https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg | 1 | Bernese_mountain_dog | 0.651137 | True | English_springer | 0.263788 | True | Greater_Swiss_Mountain_dog | 0.016199 | True |
6 | 666051853826850816 | https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg | 1 | box_turtle | 0.933012 | False | mud_turtle | 0.045885 | False | terrapin | 0.017885 | False |
7 | 666055525042405380 | https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg | 1 | chow | 0.692517 | True | Tibetan_mastiff | 0.058279 | True | fur_coat | 0.054449 | False |
8 | 666057090499244032 | https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg | 1 | shopping_cart | 0.962465 | False | shopping_basket | 0.014594 | False | golden_retriever | 0.007959 | True |
9 | 666058600524156928 | https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg | 1 | miniature_poodle | 0.201493 | True | komondor | 0.192305 | True | soft-coated_wheaten_terrier | 0.082086 | True |
10 | 666063827256086533 | https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg | 1 | golden_retriever | 0.775930 | True | Tibetan_mastiff | 0.093718 | True | Labrador_retriever | 0.072427 | True |
... For full data please check my Github page ...
2075 rows × 12 columns
The WeRateDogs Twitter archive¶
# Load the WeRateDogs archive from twitter-archive-enhanced.csv (expected in
# the working directory) into df1.
df1 = pd.read_csv('twitter-archive-enhanced.csv')
df1
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
5 | 891087950875897856 | NaN | NaN | 2017-07-29 00:08:17 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a majestic great white breaching ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891087950... | 13 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
2356 rows × 17 columns
Query Twitter API and read into dataframe¶
# Set up an authenticated tweepy client, bound to `api`.
import tweepy
# Placeholder credentials: replace with real values before running, and keep
# real keys out of any shared or published copy of this notebook.
consumer_key = '-------------'
consumer_secret = '---------------'
access_token = '-----------------'
access_secret = '------------'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# NOTE(review): judging by the keyword names, the client waits (and reports it)
# when the rate limit is hit -- confirm against the installed tweepy version,
# as these keyword arguments are version-dependent.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)
# public_tweets = api.home_timeline()
# for tweet in public_tweets:
# print(tweet.text)
8643 38970
# Look up every tweet id of df1 through the API and store one status per
# line in tweet_json.txt; line_count ends up as the number of stored lines.
line_count = 0
with open('tweet_json.txt', encoding='utf-8', mode = 'w') as file:
    for i, tweet_id in enumerate(df1['tweet_id']):
        try:
            result = api.get_status(str(tweet_id))
        except tweepy.TweepError as e:
            # Failed lookups are reported and skipped.
            print(i, e.reason)
        else:
            print(i, tweet_id)
            # Each line holds the str() of the status dict.
            file.write(str(result._json) + '\n')
            line_count += 1
print(line_count)
0 892420643555336193 1 892177421306343426 2 891815181378084864 3 891689557279858688 4 891327558926688256 5 891087950875897856 6 890971913173991426 7 890729181411237888 8 890609185150312448 9 890240255349198849 10 890006608113172480 11 889880896479866881 12 889665388333682689 13 889638837579907072 14 889531135344209921 15 889278841981685760 16 888917238123831296 17 888804989199671297 18 888554962724278272 19 [{'code': 144, 'message': 'No status found with that ID.'}] 20 888078434458587136 21 887705289381826560 22 887517139158093824 23 887473957103951883 24 887343217045368832 25 887101392804085760 ....
# Synthesize df2 from tweet_json.txt.
# The file is parsed independently of the query cell so that this cell can be
# re-run on its own; querying the Twitter API takes some time.
# The patterns expect single-quoted keys, i.e. the str() form of a dict.
regex_id = r"'id': ([0-9]+)"
regex_retweet = r"'retweet_count': ([0-9]+)"
regex_fav = r"'favorite_count': ([0-9]+)"
r_id = re.compile(regex_id)
r_ret = re.compile(regex_retweet)
r_fav = re.compile(regex_fav)

df2_list = []
skipped = 0
with open('tweet_json.txt', encoding='utf-8') as file:
    # Iterate over whatever the file contains instead of a hard-coded number
    # of lines, so a shorter or longer file is handled correctly.
    for title in file:
        # The first occurrence of each key on the line is used.
        idre = r_id.search(title)
        retc = r_ret.search(title)
        favc = r_fav.search(title)
        if not (idre and retc and favc):
            # Blank or incomplete line: nothing to record.
            skipped += 1
            continue
        # Values are kept as the matched strings.
        df2_list.append({'tweet_id': idre.group(1),
                         'retweet_count': retc.group(1),
                         'favorite_count': favc.group(1)})
if skipped:
    print('skipped lines:', skipped)
# Number of records actually parsed from the file.
line_count = len(df2_list)
df2 = pd.DataFrame(df2_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])
df2
tweet_id | retweet_count | favorite_count | |
---|---|---|---|
0 | 892420643555336193 | 8641 | 38967 |
1 | 892177421306343426 | 6351 | 33354 |
2 | 891815181378084864 | 4213 | 25136 |
3 | 891689557279858688 | 8759 | 42299 |
4 | 891327558926688256 | 9521 | 40472 |
5 | 891087950875897856 | 3155 | 20293 |
6 | 890971913173991426 | 2104 | 11900 |
7 | 890729181411237888 | 19157 | 65817 |
8 | 890609185150312448 | 4321 | 27876 |
9 | 890240255349198849 | 7524 | 32075 |
10 | 890006608113172480 | 7439 | 30775 |
... For full data please check my Github page ...
2345 rows × 3 columns
Assessing data¶
df[df.tweet_id.duplicated()]
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog |
---|
df1[df1.tweet_id.duplicated()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo |
---|
# df2 is built with a 'tweet_id' column (there is no 'id' column).
df2[df2.tweet_id.duplicated()]
id | retweet count | favorite_count |
---|
sum(df.tweet_id.isnull())
0
sum(df1.tweet_id.isnull())
0
sum(df1.rating_numerator.isnull())
0
# df2 is built with a 'tweet_id' column (there is no 'id' column).
sum(df2.tweet_id.isnull())
0
df1.rating_denominator.value_counts()
10 2333 11 3 50 3 80 2 20 2 2 1 16 1 40 1 70 1 15 1 90 1 110 1 120 1 130 1 150 1 170 1 7 1 0 1 Name: rating_denominator, dtype: int64
df1.rating_numerator.value_counts()
12 558 11 464 10 461 13 351 9 158 8 102 7 55 14 54 5 37 6 32 3 19 4 17 1 9 2 9 420 2 0 2 15 2 75 2 80 1 20 1 ...... Name: rating_numerator, dtype: int64
df1[(df1['rating_numerator'] >= 20)]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
188 | 855862651834028034 | 8.558616e+17 | 1.943518e+08 | 2017-04-22 19:15:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | NaN | NaN | NaN | 420 | 10 | None | None | None | None | None |
189 | 855860136149123072 | 8.558585e+17 | 1.361572e+07 | 2017-04-22 19:05:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @s8n You tried very hard to portray this good ... | NaN | NaN | NaN | NaN | 666 | 10 | None | None | None | None | None |
290 | 838150277551247360 | 8.381455e+17 | 2.195506e+07 | 2017-03-04 22:12:52 +0000 | <a href="http://twitter.com/download/iphone" r... | @markhoppus 182/10 | NaN | NaN | NaN | NaN | 182 | 10 | None | None | None | None | None |
313 | 835246439529840640 | 8.352460e+17 | 2.625958e+07 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
340 | 832215909146226688 | NaN | NaN | 2017-02-16 13:11:49 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Logan, the Chow who liv... | 7.867091e+17 | 4.196984e+09 | 2016-10-13 23:23:56 +0000 | https://twitter.com/dog_rates/status/786709082... | 75 | 10 | Logan | None | None | None | None |
... For full data please check my Github page ...
df1[(df1['rating_denominator'] < 10)]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
313 | 835246439529840640 | 8.352460e+17 | 26259576.0 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
516 | 810984652412424192 | NaN | NaN | 2016-12-19 23:06:23 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Sam. She smiles 24/7 & secretly aspir... | NaN | NaN | NaN | https://www.gofundme.com/sams-smile,https://tw... | 24 | 7 | Sam | None | None | None | None |
2335 | 666287406224695296 | NaN | NaN | 2015-11-16 16:11:11 +0000 | <a href="http://twitter.com/download/iphone" r... | This is an Albanian 3 1/2 legged Episcopalian... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666287406... | 1 | 2 | an | None | None | None | None |
# Show complete cell contents. None is the supported "no limit" value;
# negative values are deprecated and rejected by current pandas releases.
pd.options.display.max_colwidth = None
# Rows whose name is entirely lower case.
df1[(df1['name'].str.islower() == True)]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
22 | 887517139158093824 | NaN | NaN | 2017-07-19 03:39:09 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | I've yet to rate a Venezuelan Hover Wiener. This is such an honor. 14/10 paw-inspiring af (IG: roxy.thedoxy) https://t.co/20VrLAA8ba | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887517139158093824/video/1 | 14 | 10 | such | None | None | None | None |
56 | 881536004380872706 | NaN | NaN | 2017-07-02 15:32:16 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF | NaN | NaN | NaN | https://twitter.com/dog_rates/status/881536004380872706/video/1 | 14 | 10 | a | None | None | pupper | None |
118 | 869988702071779329 | NaN | NaN | 2017-05-31 18:47:24 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @dog_rates: We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10… | 8.591970e+17 | 4.196984e+09 | 2017-05-02 00:04:57 +0000 | https://twitter.com/dog_rates/status/859196978902773760/video/1 | 12 | 10 | quite | None | None | None | None |
169 | 859196978902773760 | NaN | NaN | 2017-05-02 00:04:57 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 https://t.co/g2nSyGenG9 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/859196978902773760/video/1 | 12 | 10 | quite | None | None | None | None |
193 | 855459453768019968 | NaN | NaN | 2017-04-21 16:33:22 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective https://t.co/Xc7uj1C64x | NaN | NaN | NaN | https://twitter.com/dog_rates/status/855459453768019968/photo/1,https://twitter.com/dog_rates/status/855459453768019968/photo/1 | 12 | 10 | quite | None | None | None | None |
... For full data please check my Github page ...
109 rows × 17 columns
rating_numerator | rating_denominator | text | |
---|---|---|---|
342 | 11 | 15 | @docmisterio account started on 11/15/15 |
433 | 84 | 70 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd |
784 | 9 | 11 | RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… |
902 | 165 | 150 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE |
1068 | 9 | 11 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ |
1120 | 204 | 170 | Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv |
1165 | 4 | 20 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a |
1202 | 50 | 50 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq |
1228 | 99 | 90 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 |
1254 | 80 | 80 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 |
1274 | 45 | 50 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK |
1351 | 60 | 50 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa |
1433 | 44 | 40 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ |
1598 | 4 | 20 | Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating |
1634 | 143 | 130 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 |
1635 | 121 | 110 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 |
1662 | 7 | 11 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 |
1663 | 20 | 16 | I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible |
1779 | 144 | 120 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq |
1843 | 88 | 80 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw |
df1[df1.name.duplicated()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 890729181411237888 | NaN | NaN | 2017-07-28 00:22:40 +0000 | <a href="http://twitter.com/download/iphone" r... | When you watch your owner call another dog a g... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890729181... | 13 | 10 | None | None | None | None | None |
12 | 889665388333682689 | NaN | NaN | 2017-07-25 01:55:32 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a puppo that seems to be on the fence a... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889665388... | 13 | 10 | None | None | None | None | puppo |
23 | 887473957103951883 | NaN | NaN | 2017-07-19 00:47:34 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Canela. She attempted some fancy porch... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
24 | 887343217045368832 | NaN | NaN | 2017-07-18 16:08:03 +0000 | <a href="http://twitter.com/download/iphone" r... | You may not have known you needed to see this ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887343217... | 13 | 10 | None | None | None | None | None |
25 | 887101392804085760 | NaN | NaN | 2017-07-18 00:07:08 +0000 | <a href="http://twitter.com/download/iphone" r... | This... is a Jubilant Antarctic House Bear. We... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887101392... | 12 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
1399 rows × 17 columns
df1[df1.name.isnull()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo |
---|
df1[(df1['text'].str.slice(start=0, stop=2) == 'RT')]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Athletics: 12/10 #BATP https://t.co/WxwJmv... | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434... | 12 | 10 | None | None | None | None | None |
36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Lilly. She just paralle... | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320... | 13 | 10 | Lilly | None | None | None | None |
68 | 879130579576475649 | NaN | NaN | 2017-06-26 00:13:58 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Emmy. She was adopted t... | 8.780576e+17 | 4.196984e+09 | 2017-06-23 01:10:23 +0000 | https://twitter.com/dog_rates/status/878057613... | 14 | 10 | Emmy | None | None | None | None |
73 | 878404777348136964 | NaN | NaN | 2017-06-24 00:09:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Meet Shadow. In an attempt to r... | 8.782815e+17 | 4.196984e+09 | 2017-06-23 16:00:04 +0000 | https://www.gofundme.com/3yd6y1c,https://twitt... | 13 | 10 | Shadow | None | None | None | None |
... For full data please check my Github page ...
183 rows × 17 columns
df1[df1.retweeted_status_id.notnull()]
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Athletics: 12/10 #BATP https://t.co/WxwJmv... | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434... | 12 | 10 | None | None | None | None | None |
36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Lilly. She just paralle... | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320... | 13 | 10 | Lilly | None | None | None | None |
68 | 879130579576475649 | NaN | NaN | 2017-06-26 00:13:58 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Emmy. She was adopted t... | 8.780576e+17 | 4.196984e+09 | 2017-06-23 01:10:23 +0000 | https://twitter.com/dog_rates/status/878057613... | 14 | 10 | Emmy | None | None | None | None |
73 | 878404777348136964 | NaN | NaN | 2017-06-24 00:09:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Meet Shadow. In an attempt to r... | 8.782815e+17 | 4.196984e+09 | 2017-06-23 16:00:04 +0000 | https://www.gofundme.com/3yd6y1c,https://twitt... | 13 | 10 | Shadow | None | None | None | None |
... For full data please check my Github page ...
181 rows × 17 columns
# NOTE(review): df1_clean is only created in the "Cleaning data" section
# further down, so this cell depends on that section having been run first.
df1_clean['name'].describe()
count 2247 unique 932 top None freq 745 Name: name, dtype: object
# Show complete cell contents. None is the supported "no limit" value;
# negative values are deprecated and rejected by current pandas releases.
pd.options.display.max_colwidth = None
# Rows whose source, from character 72 up to the last four characters, is not
# exactly 'iPhone'.
df1[(df1['source'].str.slice(start=72, stop=-4) != 'iPhone')]
# df1['source'].str.slice(start=72, stop=-4)
# df1['source'].str.slice(start=0, stop=-4)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
209 | 852226086759018497 | NaN | NaN | 2017-04-12 18:25:07 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | Meet General. He wasn't content with the quality of his room. Requested to pupgrade, but was ignored. 14/10 look who just lost a customer https://t.co/NP5JW8LnmW | NaN | NaN | NaN | https://twitter.com/dog_rates/status/852226086759018497/video/1 | 14 | 10 | General | None | None | None | None |
270 | 841314665196081154 | NaN | NaN | 2017-03-13 15:47:01 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | This is Max. There's no way in h*ck you're taking his pacifier. Binky promises it's not happening. 13/10 very good stubborn boy https://t.co/9lVAqDEvZ5 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/841314665196081154/video/1 | 13 | 10 | Max | None | None | None | None |
335 | 832645525019123713 | NaN | NaN | 2017-02-17 17:38:57 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | There's going to be a dog terminal at JFK Airport. This is not a drill. 10/10 \nhttps://t.co/dp5h9bCwU7 | NaN | NaN | NaN | http://us.blastingnews.com/news/2017/02/jfk-announces-its-first-ever-ark-oasis-animal-terminal-001480161.html?sbdht=_pM1QUzk3wsdTxcmMoRPV7FWYYlsNKcFRcYSY7OmeHnOXA4NtUM6PLQ2_ | 10 | 10 | not | None | None | None | None |
352 | 831315979191906304 | NaN | NaN | 2017-02-14 01:35:49 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | I couldn't make it to the #WKCDogShow BUT I have people there on the ground relaying me the finest pupper pics possible. 13/10 for all https://t.co/jd6lYhfdH4 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1 | 13 | 10 | None | None | None | pupper | None |
375 | 828361771580813312 | NaN | NaN | 2017-02-05 21:56:51 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | Beebop and Doobert should start a band 12/10 would listen | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
135 rows × 17 columns
df2['tweet_id'].describe()
count 2345 unique 2345 top 778650543019483137 freq 1 Name: tweet_id, dtype: object
Quality¶
image-predictions.tsv
- Some values in P1, P2, and P3 begin with lower case, but some begin with upper case.
- Missing records (2075 rows vs. 2356 records in twitter-archive-enhanced.csv).
twitter-archive-enhanced.csv
- Incomplete data - missing value in multiple columns.
- Incorrect values in name (for example, such, an, a ...).
- Missing values are represented by both "NaN" and "None".
- Incorrect values in rating (both rating_numerator and rating_denominator).
- The IDs in retweeted_status_id and retweeted_status_user_id are also not displayed correctly (they appear as floats in scientific notation).
df2
- Missing 11 data records (error "No status found with that ID" when querying twitter API).
- Values are stored as strings, not integers.
Tidiness¶
twitter-archive-enhanced.csv
- Source contains HTML code.
- Some records are for retweets
Cleaning data¶
# Clean on copies so the three gathered frames stay as they were loaded.
df_clean, df1_clean, df2_clean = (frame.copy() for frame in (df, df1, df2))
Missing Data¶
image-predictions.tsv¶
- Missing records (2075 vs 2356 records in twitter-archive-enhanced.csv ).
twitter-archive-enhanced.csv¶
- Incomplete data - missing value in multiple columns.
df2 (tweepy query result)¶
- Missing 11 data records (error "No status found with that ID" when querying twitter API).
The missing data are not cleanable¶
Tidiness¶
Some of the records in twitter-archive-enhanced.csv are retweets¶
Define¶
Remove the records that are for retweets in twitter-archive-enhanced.csv.
Code¶
# Keep only the rows that carry no retweeted_status_id.
df1_clean = df1_clean[df1_clean['retweeted_status_id'].isnull()]
Test¶
df1_clean[(df1_clean['text'].str.slice(start=0, stop=2) == 'RT')]
# df1_clean
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1286 | 708400866336894977 | NaN | NaN | 2016-03-11 21:15:02 +0000 | <a href="http://vine.co" rel="nofollow">Vine -... | RT if you are as ready for summer as this pup ... | NaN | NaN | NaN | https://vine.co/v/iHFqnjKVbIQ | 12 | 10 | None | None | None | None | None |
1860 | 675489971617296384 | NaN | NaN | 2015-12-12 01:38:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT until we find this dog. Clearly a cool dog ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/675489971... | 10 | 10 | None | None | None | None | None |
Source in twitter-archive-enhanced.csv includes HTML code, url, and source.¶
Define¶
Extract the source string from HTML code
Code¶
# Reduce the HTML anchor stored in `source` to a short client label.
# reset_index() renumbers the rows and keeps the previous labels in a new
# 'index' column.
df1_clean = df1_clean.reset_index()
# Capture the last word before the closing </a>. A raw string is used so that
# regex escapes can never be read as Python string escapes (a non-raw '\b'
# is a backspace character, not a word boundary).
source_last_word = df1_clean['source'].str.extract(r'([a-zA-Z0-9-]+)</a>', expand=False)
# Two of the captured words are relabelled; every other value is kept as is.
df1_clean['source'] = source_last_word.replace({'Client': 'Twitter Web Client',
                                                'Scene': 'Vine'})
Test¶
df1_clean.source.value_counts()
iPhone 2042 Vine 91 Twitter Web Client 31 TweetDeck 11 Name: source, dtype: int64
Quality¶
Mixed lower and upper case in p1/p2/p3 columns of image-predictions.tsv¶
Define¶
Change the values in p1/p2/p3 so that they begin with an upper-case letter followed by lower-case letters.
Code¶
# Give every prediction label the same casing: first character upper case,
# the rest lower case.
df_col = ['p1', 'p2', 'p3']
for col in df_col:
    df_clean[col] = df_clean[col].str.capitalize()
Test¶
df_clean[['p1', 'p2', 'p3']]
p1 | p2 | p3 | |
---|---|---|---|
0 | Welsh_springer_spaniel | Collie | Shetland_sheepdog |
1 | Redbone | Miniature_pinscher | Rhodesian_ridgeback |
2 | German_shepherd | Malinois | Bloodhound |
3 | Rhodesian_ridgeback | Redbone | Miniature_pinscher |
4 | Miniature_pinscher | Rottweiler | Doberman |
5 | Bernese_mountain_dog | English_springer | Greater_swiss_mountain_dog |
6 | Box_turtle | Mud_turtle | Terrapin |
7 | Chow | Tibetan_mastiff | Fur_coat |
8 | Shopping_cart | Shopping_basket | Golden_retriever |
9 | Miniature_poodle | Komondor | Soft-coated_wheaten_terrier |
10 | Golden_retriever | Tibetan_mastiff | Labrador_retriever |
... For full data please check my Github page ...
2075 rows × 3 columns
Inaccurate names in twitter-archive-enhanced.csv¶
Define¶
Extract the name from the text column and use it to replace the wrong value in the name column.
Code¶
# Repair entries of the name column that are entirely lower case by looking
# for a capitalised word after "named " or "is " in the tweet text.
regex_name2 = 'named ([A-Z][a-z]+)|is ([A-Z][a-z]+)'
r_name2 = re.compile(regex_name2)
for i in range(len(df1_clean)):
    if not df1_clean.loc[i, 'name'].islower():
        continue
    candidates = r_name2.findall(df1_clean.loc[i, 'text'])
    if not candidates:
        # Nothing found in the text: fall back to the "None" placeholder.
        df1_clean.at[i, 'name'] = "None"
    elif len(candidates) == 1:
        # Only one alternative of the pattern can match at a time, so exactly
        # one of the two captured groups is non-empty.
        named_group, is_group = candidates[0]
        df1_clean.at[i, 'name'] = named_group or is_group
        print(candidates)
    # NOTE(review): with more than one match the existing lower-case value
    # is left unchanged.
[('', 'Freudian')] [('', 'Arctic')] [('', 'Zoey')] [('', 'Quizno')] [('', 'Jamaican')] [('', 'Alaskan')] [('', 'Bulgarian')] [('', 'Dutch')] [('', 'Mongolian')] [('Wylie', '')] [('Kip', '')] [('Jacob', '')] [('Rufus', '')] [('Spork', '')] [('Cherokee', '')] [('Hemry', '')] [('Alphred', '')] [('Alfredo', '')] [('Leroi', '')] [('Berta', '')] [('Chuk', '')] [('Alfonso', '')] [('Cheryl', '')] [('Jessiga', '')] [('Klint', '')] [('Kohl', '')] [('', 'Daryl')] [('Pepe', '')] [('Octaviath', '')] [('Johm', '')]
Test¶
# Spot-check a tweet whose name should come from the "named <Name>" pattern.
df1_clean.loc[df1_clean['tweet_id'].eq(666701168228331520)]
index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2133 | 2314 | 666701168228331520 | NaN | NaN | 2015-11-17 19:35:19 +0000 | iPhone | This is a golden Buckminsterfullerene named Jo... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666701168... | 8 | 10 | Johm | None | None | None | None |
# Spot-check a tweet whose name should come from the "is <Name>" pattern.
df1_clean.loc[df1_clean['tweet_id'].eq(748692773788876800)]
index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
817 | 992 | 748692773788876800 | NaN | NaN | 2016-07-01 01:40:41 +0000 | iPhone | That is Quizno. This is his beach. He does not... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/748692773... | 10 | 10 | Quizno | doggo | None | None | None |
# Eyeball a random sample of 200 rows after the name clean-up.
df1_clean.sample(n=200)
index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1008 | 1186 | 718540630683709445 | NaN | NaN | 2016-04-08 20:46:50 +0000 | iPhone | Get you a pup that can do both. 10/10 https://... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/718540630... | 10 | 10 | None | None | None | None | None |
951 | 1129 | 729463711119904772 | NaN | NaN | 2016-05-09 00:11:16 +0000 | iPhone | Meet Pupcasso. You can't afford his art. 13/10... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/729463711... | 13 | 10 | Pupcasso | None | None | None | None |
1964 | 2143 | 669970042633789440 | NaN | NaN | 2015-11-26 20:04:40 +0000 | iPhone | This is Julio. He was one of the original Ring... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/669970042... | 10 | 10 | Julio | None | None | None | None |
803 | 978 | 749996283729883136 | NaN | NaN | 2016-07-04 16:00:22 +0000 | TweetDeck | This is Bo. He emanates happiness. 12/10 I cou... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/749996283... | 12 | 10 | Bo | None | None | None | None |
1494 | 1673 | 682303737705140231 | NaN | NaN | 2015-12-30 20:54:22 +0000 | iPhone | This is Todo. He's screaming because he doesn'... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/682303737... | 9 | 10 | Todo | None | None | None | None |
... For full data please check my Github page ...
200 rows × 18 columns
Missing values are represented by both "NaN" and "None".¶
Define¶
Replace the "None" in twitter-archive-enhanced.csv with "NaN".
Code¶
# Use a single missing-value marker: every cell holding the string "None"
# becomes a real NaN, modifying the frame in place.
df1_clean.replace(to_replace="None", value=np.nan, inplace=True)
Test¶
# Eyeball a random sample of 200 rows to confirm "None" no longer appears.
df1_clean.sample(n=200)
index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
494 | 614 | 796759840936919040 | NaN | NaN | 2016-11-10 17:02:03 +0000 | iPhone | Say hello to Romeo. He was just told that it's... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/796759840... | 11 | 10 | Romeo | NaN | NaN | NaN | NaN |
2151 | 2332 | 666345417576210432 | NaN | NaN | 2015-11-16 20:01:42 +0000 | iPhone | Look at this jokester thinking seat belt laws ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666345417... | 10 | 10 | NaN | NaN | NaN | NaN | NaN |
1444 | 1623 | 684902183876321280 | NaN | NaN | 2016-01-07 00:59:40 +0000 | iPhone | This is Perry. He's an Augustus Gloopster. Ver... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/684902183... | 11 | 10 | Perry | NaN | NaN | NaN | NaN |
1524 | 1703 | 680913438424612864 | NaN | NaN | 2015-12-27 00:49:49 +0000 | iPhone | Meet Griswold. He's dapper as hell. Already pu... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/680913438... | 11 | 10 | Griswold | NaN | NaN | NaN | NaN |
1782 | 1961 | 673359818736984064 | NaN | NaN | 2015-12-06 04:34:25 +0000 | iPhone | This is Steve. He was just relaxing in hot tub... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/673359818... | 8 | 10 | Steve | NaN | NaN | NaN | NaN |
... For full data please check my Github page ...
200 rows × 18 columns
Incorrect values in rating (both rating_numerator and rating_denominator).¶
Define¶
List all the ratings with a denominator not equal to 10, and correct the wrong ratings. Remove the records that don't have a rating.
Code¶
# Show full cell contents (no truncation) so the whole tweet text is readable.
# None is the supported "unlimited" value; the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# List every tweet whose parsed denominator is not 10, with its text, so the
# ratings can be checked by hand.
df1_clean.loc[
    df1_clean['rating_denominator'].ne(10),
    ['rating_numerator', 'rating_denominator', 'text'],
]
rating_numerator | rating_denominator | text | |
---|---|---|---|
263 | 960 | 0 | @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho |
287 | 11 | 15 | @docmisterio account started on 11/15/15 |
363 | 84 | 70 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd |
429 | 24 | 7 | Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx |
733 | 165 | 150 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE |
890 | 9 | 11 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ |
942 | 204 | 170 | Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv |
987 | 4 | 20 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a |
1024 | 50 | 50 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq |
1050 | 99 | 90 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 |
1075 | 80 | 80 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 |
1095 | 45 | 50 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK |
1172 | 60 | 50 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa |
1254 | 44 | 40 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ |
1419 | 4 | 20 | Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating |
1455 | 143 | 130 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 |
1456 | 121 | 110 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 |
1483 | 7 | 11 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 |
1484 | 20 | 16 | I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible |
1600 | 144 | 120 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq |
1664 | 88 | 80 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw |
2154 | 1 | 2 | This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv |
# Manual rating corrections keyed by row label in df1_clean. Each value is the
# (numerator, denominator) actually stated in the tweet text; (0, 0) marks a
# tweet that contains no rating at all, so the row is dropped further down.
# Multi-dog ratings such as 84/70 or 165/150 are genuine and are left alone.
rating_fixes = {
    263: (13, 10),   # "960/00 isn't a valid rating, 13/10 is tho"
    287: (0, 0),     # "account started on 11/15/15" is a date
    429: (0, 0),     # "smiles 24/7" is an idiom
    890: (14, 10),   # "9/11 search dog ... our second ever 14/10"
    987: (13, 10),   # "Happy 4/20 ... 13/10 for all"
    1024: (11, 10),  # "split 50/50 ... 11/10" (previously left as 50/50)
    1419: (0, 0),    # "a rating of 4/20 would've been fitting" - none given
    1483: (10, 10),  # "robbed a 7/11 ... 10/10"
    1484: (0, 0),    # "I could've said 20/16, but ..." - none given
    2154: (9, 10),   # "3 1/2 legged ... 9/10"
}
for row_label, (numerator, denominator) in rating_fixes.items():
    df1_clean.loc[row_label, 'rating_numerator'] = numerator
    df1_clean.loc[row_label, 'rating_denominator'] = denominator

# Remove the records that don't have a rating (denominator set to 0 above).
df1_clean = df1_clean.loc[df1_clean['rating_denominator'] != 0]
Test¶
# Count the remaining denominators; 0 should no longer be present.
df1_clean['rating_denominator'].value_counts()
10 2158 50 3 80 2 170 1 150 1 130 1 120 1 110 1 90 1 70 1 40 1 20 1 16 1 Name: rating_denominator, dtype: int64
The numbers in retweeted_status_id and retweeted_status_user_id are also not displayed correctly.¶
This has already been solved by cleaning tidiness issue 1, which removed all the retweet data, but I would like to remove the retweeted-status and in-reply-to-status columns here to keep the master dataframe slim.¶
Define¶
Remove in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp columns
Code¶
# Reply and retweet bookkeeping columns are not needed in the master frame.
reply_retweet_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
df1_clean = df1_clean.drop(columns=reply_retweet_columns)
Test¶
# Display the frame to check which columns remain after the drop.
df1_clean
index | tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 892420643555336193 | 2017-08-01 16:23:56 +0000 | iPhone | This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU | https://twitter.com/dog_rates/status/892420643555336193/photo/1 | 13 | 10 | Phineas | NaN | NaN | NaN | NaN |
1 | 1 | 892177421306343426 | 2017-08-01 00:17:27 +0000 | iPhone | This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV | https://twitter.com/dog_rates/status/892177421306343426/photo/1 | 13 | 10 | Tilly | NaN | NaN | NaN | NaN |
2 | 2 | 891815181378084864 | 2017-07-31 00:18:03 +0000 | iPhone | This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB | https://twitter.com/dog_rates/status/891815181378084864/photo/1 | 12 | 10 | Archie | NaN | NaN | NaN | NaN |
3 | 3 | 891689557279858688 | 2017-07-30 15:58:51 +0000 | iPhone | This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ | https://twitter.com/dog_rates/status/891689557279858688/photo/1 | 13 | 10 | Darla | NaN | NaN | NaN | NaN |
4 | 4 | 891327558926688256 | 2017-07-29 16:00:24 +0000 | iPhone | This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f | https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1 | 12 | 10 | Franklin | NaN | NaN | NaN | NaN |
... For full data please check my Github page ...
2173 rows × 13 columns
Values in df2 are strings, not numbers¶
Define¶
Convert the columns in df2 to int
Code¶
# Convert the id and count columns from strings to integers.
# 'int64' is spelled out because plain `int` maps to the platform's C long,
# which can be 32 bits wide (Windows with NumPy < 2) and cannot hold an
# 18-digit tweet id.
for numeric_col in ('tweet_id', 'retweet_count', 'favorite_count'):
    df2_clean[numeric_col] = df2_clean[numeric_col].astype('int64')
Test¶
# Summary statistics for tweet_id after the conversion.
# NOTE(review): describe() reports its statistics as floats, so this output
# does not show the column's own dtype; `df2_clean.dtypes` would confirm the
# integer conversion directly.
df2_clean['tweet_id'].describe()
count 2.345000e+03 mean 7.422940e+17 std 6.833642e+16 min 6.660209e+17 25% 6.783802e+17 50% 7.189392e+17 75% 7.986979e+17 max 8.924206e+17 Name: tweet_id, dtype: float64
Merging all three dataframes¶
# Join image predictions, the cleaned archive and the retweet/favorite counts
# on tweet_id (default inner join), then display the combined frame.
df_master_clean = (
    df_clean
    .merge(df1_clean, on='tweet_id')
    .merge(df2_clean, on='tweet_id')
)
df_master_clean
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | ... | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | retweet_count | favorite_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | Collie | 0.156665 | True | Shetland_sheepdog | ... | https://twitter.com/dog_rates/status/666020888022790149/photo/1 | 8 | 10 | NaN | NaN | NaN | NaN | NaN | 521 | 2560 |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | Redbone | 0.506826 | True | Miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | ... | https://twitter.com/dog_rates/status/666029285002620928/photo/1 | 7 | 10 | NaN | NaN | NaN | NaN | NaN | 47 | 131 |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | Malinois | 0.138584 | True | Bloodhound | ... | https://twitter.com/dog_rates/status/666033412701032449/photo/1 | 9 | 10 | NaN | NaN | NaN | NaN | NaN | 45 | 125 |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | Redbone | 0.360687 | True | Miniature_pinscher | ... | https://twitter.com/dog_rates/status/666044226329800704/photo/1 | 6 | 10 | NaN | NaN | NaN | NaN | NaN | 141 | 302 |
4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | Miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | ... | https://twitter.com/dog_rates/status/666049248165822465/photo/1 | 5 | 10 | NaN | NaN | NaN | NaN | NaN | 40 | 109 |
5 | 666050758794694657 | https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg | 1 | Bernese_mountain_dog | 0.651137 | True | English_springer | 0.263788 | True | Greater_swiss_mountain_dog | ... | https://twitter.com/dog_rates/status/666050758794694657/photo/1 | 10 | 10 | NaN | NaN | NaN | NaN | NaN | 58 | 133 |
... For full data please check my Github page ...
1993 rows × 26 columns
Exporting master dataframe to CSV file¶
# Write the merged master frame to disk. index=False keeps the positional row
# index out of the file, so reading it back does not create a stray
# "Unnamed: 0" column.
df_master_clean.to_csv('twitter_archive_master.csv', index=False)
Analyzing and Visualizing Data¶
# Load the exported master CSV; all analysis below works from this frame.
df_master = pd.read_csv('twitter_archive_master.csv')
The breeds of dog ranked in the top 20 favorited tweets.¶
# Top prediction (p1) and is-a-dog flag for the 20 tweets with the highest
# favorite counts.
favorite_columns = ['p1', 'p1_dog', 'favorite_count']
by_favorites = df_master.sort_values('favorite_count', ascending=False)
df_top_fav = by_favorites.head(20)[favorite_columns]
df_top_fav
p1 | p1_dog | favorite_count | |
---|---|---|---|
1683 | Lakeland_terrier | True | 143952 |
1218 | Labrador_retriever | True | 128764 |
1884 | French_bulldog | True | 124651 |
1593 | Chihuahua | True | 123459 |
1934 | English_springer | True | 106354 |
1659 | Standard_poodle | True | 94025 |
1857 | Angora | False | 92951 |
1899 | Golden_retriever | True | 83699 |
1591 | Arabian_camel | False | 82538 |
1927 | Chesapeake_bay_retriever | True | 80383 |
511 | Bubble | False | 79048 |
1959 | Italian_greyhound | True | 77876 |
1868 | Chow | True | 76730 |
1183 | Eskimo_dog | True | 73658 |
1727 | Labrador_retriever | True | 72236 |
1970 | Pembroke | True | 69403 |
1892 | Laptop | False | 66664 |
1985 | Pomeranian | True | 65817 |
569 | Swing | False | 61243 |
1720 | Boxer | True | 57279 |
The breeds of dog ranked in the top 20 retweeted tweets.¶
# Top prediction (p1) and is-a-dog flag for the 20 tweets with the highest
# retweet counts.
retweet_columns = ['p1', 'p1_dog', 'retweet_count']
by_retweets = df_master.sort_values('retweet_count', ascending=False)
df_top_ret = by_retweets.head(20)[retweet_columns]
df_top_ret
p1 | p1_dog | retweet_count | |
---|---|---|---|
1218 | Labrador_retriever | True | 77716 |
1593 | Chihuahua | True | 61444 |
1183 | Eskimo_dog | True | 51148 |
1683 | Lakeland_terrier | True | 49407 |
1934 | English_springer | True | 44828 |
... For full data please contact me ...
The top rated dogs and their tweet info¶
# The analysis uses a single `rating` value, numerator divided by denominator
# (so 14/10 -> 1.4). Derive it here if the loaded frame does not already have
# it, so this cell does not fail with a KeyError.
if 'rating' not in df_master.columns:
    df_master['rating'] = (
        df_master['rating_numerator'] / df_master['rating_denominator']
    )

# Tweets rated 14/10 or better, best first.
df_top_rated = (
    df_master
    .loc[df_master['rating'] >= 1.4, ['p1', 'p1_dog', 'rating']]
    .sort_values('rating', ascending=False)
)
df_top_rated
p1 | p1_dog | rating | |
---|---|---|---|
1267 | Bow_tie | False | 177.6 |
275 | Microphone | False | 42.0 |
1490 | Pomeranian | True | 7.5 |
1440 | Clumber | True | 2.7 |
619 | Kuvasz | True | 2.6 |
... For full data please contact me ...
Visualization¶
The most popular breeds of dog¶
# Stack the three "top" tables into one. DataFrame.append was removed in
# pandas 2.0, so use pd.concat; sort=True keeps the alphabetical column order
# that the chained append calls used to produce. Columns missing from a given
# table are filled with NaN.
df_top_dogs = pd.concat([df_top_fav, df_top_ret, df_top_rated], sort=True)

# Keep only rows whose top prediction is actually a dog breed.
df_top_dogs = df_top_dogs[df_top_dogs['p1_dog'].eq(True)]
df_top_dogs
favorite_count | p1 | p1_dog | rating | retweet_count | |
---|---|---|---|---|---|
1683 | 143952.0 | Lakeland_terrier | True | NaN | NaN |
1218 | 128764.0 | Labrador_retriever | True | NaN | NaN |
1884 | 124651.0 | French_bulldog | True | NaN | NaN |
1593 | 123459.0 | Chihuahua | True | NaN | NaN |
1934 | 106354.0 | English_springer | True | NaN | NaN |
... For full data please contact me ...
# Horizontal bar chart of how often each breed appears in the combined table;
# ascending sort puts the most frequent breed at the top of the chart.
breed_counts = df_top_dogs['p1'].value_counts().sort_values(ascending=True)
breed_counts.plot.barh(figsize=(8, 8), rot=0)
plt.title('The most popular breed found')
Text(0.5,1,'The most popular breed found')
# Not part of the report: number of tweets per posting source.
df_master['source'].value_counts()
iPhone 1954 Twitter Web Client 28 TweetDeck 11 Name: source, dtype: int64
# Pie chart of the share of tweets per source, labelled with percentages.
tweets_per_source = df_master['source'].value_counts()
tweets_per_source.plot.pie(figsize=(8, 8), autopct='%.2f')
plt.axis('off')    # hide the axes frame and labels
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Source pie chart')
Text(0.5,1,'Source pie chart')