import requests
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
Gathering data¶
Download the tweet image predictions¶
# Download the image-predictions file into a local folder, then load it.
folder_name = 'download'
# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(folder_name, exist_ok=True)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Stop here on an HTTP error instead of saving an error page as the .tsv.
response.raise_for_status()
# The file is stored under the last path segment of the URL.
file_path = os.path.join(folder_name, url.split('/')[-1])
with open(file_path, mode='wb') as file:
    file.write(response.content)
# Read back exactly the file that was just written (tab-separated).
df = pd.read_csv(file_path, sep='\t')
df
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
| 5 | 666050758794694657 | https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg | 1 | Bernese_mountain_dog | 0.651137 | True | English_springer | 0.263788 | True | Greater_Swiss_Mountain_dog | 0.016199 | True |
| 6 | 666051853826850816 | https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg | 1 | box_turtle | 0.933012 | False | mud_turtle | 0.045885 | False | terrapin | 0.017885 | False |
| 7 | 666055525042405380 | https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg | 1 | chow | 0.692517 | True | Tibetan_mastiff | 0.058279 | True | fur_coat | 0.054449 | False |
| 8 | 666057090499244032 | https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg | 1 | shopping_cart | 0.962465 | False | shopping_basket | 0.014594 | False | golden_retriever | 0.007959 | True |
| 9 | 666058600524156928 | https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg | 1 | miniature_poodle | 0.201493 | True | komondor | 0.192305 | True | soft-coated_wheaten_terrier | 0.082086 | True |
| 10 | 666063827256086533 | https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg | 1 | golden_retriever | 0.775930 | True | Tibetan_mastiff | 0.093718 | True | Labrador_retriever | 0.072427 | True |
... For full data please check my Github page ...
2075 rows × 12 columns
The WeRateDogs Twitter archive¶
# Load the WeRateDogs archive; the CSV is looked up relative to the
# current working directory.
df1 = pd.read_csv('twitter-archive-enhanced.csv')
df1
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
| 5 | 891087950875897856 | NaN | NaN | 2017-07-29 00:08:17 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a majestic great white breaching ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891087950... | 13 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
2356 rows × 17 columns
Query Twitter API and read into dataframe¶
#tweepy
import tweepy

# Credentials are taken from environment variables so real keys never need
# to be written into the notebook; the redacted placeholders are kept as
# fallbacks, so behavior is unchanged when the variables are not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '-------------')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '---------------')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-----------------')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '------------')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# Let tweepy wait (and print a notice) when the rate limit is reached.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

# Optional smoke test of the credentials:
# public_tweets = api.home_timeline()
# for tweet in public_tweets:
#     print(tweet.text)
8643 38970
# Fetch every archived tweet through the API and store one record per line.
# NOTE: each stored line is str(result._json), i.e. the Python repr of the
# tweet dict rather than JSON text.
line_count = 0
with open('tweet_json.txt', encoding='utf-8', mode = 'w') as file:
    for i, tweet_id in enumerate(df1['tweet_id']):
        try:
            result = api.get_status(str(tweet_id))
        except tweepy.TweepError as e:
            # Report the failure and carry on with the next tweet.
            print(i, e.reason)
        else:
            print(i, tweet_id)
            file.write(str(result._json) + '\n')
            line_count += 1
# Number of tweets that were written to the file.
print(line_count)
0 892420643555336193
1 892177421306343426
2 891815181378084864
3 891689557279858688
4 891327558926688256
5 891087950875897856
6 890971913173991426
7 890729181411237888
8 890609185150312448
9 890240255349198849
10 890006608113172480
11 889880896479866881
12 889665388333682689
13 889638837579907072
14 889531135344209921
15 889278841981685760
16 888917238123831296
17 888804989199671297
18 888554962724278272
19 [{'code': 144, 'message': 'No status found with that ID.'}]
20 888078434458587136
21 887705289381826560
22 887517139158093824
23 887473957103951883
24 887343217045368832
25 887101392804085760
....
#Synthesize df2 from tweet_json.txt
# This cell can be run on its own (without re-querying the Twitter API) as
# long as tweet_json.txt already exists.
import ast

# Every line of tweet_json.txt is the repr of one tweet dict. Parsing it back
# into a dict and reading the top-level keys is more reliable than taking the
# first regex hit, because nested objects inside a tweet can carry keys with
# the same names ('id', 'retweet_count', 'favorite_count').
df2_list = []
with open('tweet_json.txt', encoding='utf-8') as file:
    # Iterate over the lines actually present instead of a hard-coded count,
    # so a shorter or longer file is handled without an IndexError.
    for line in file:
        line = line.strip()
        if not line:
            continue
        tweet = ast.literal_eval(line)
        # Values are kept as strings, as before, so later cells see the
        # same dtypes.
        df2_list.append({'tweet_id' : str(tweet['id']),
                         'retweet_count' : str(tweet['retweet_count']),
                         'favorite_count' : str(tweet['favorite_count'])})
# Number of tweet records that were read.
line_count = len(df2_list)
df2 = pd.DataFrame(df2_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])
df2
| tweet_id | retweet_count | favorite_count | |
|---|---|---|---|
| 0 | 892420643555336193 | 8641 | 38967 |
| 1 | 892177421306343426 | 6351 | 33354 |
| 2 | 891815181378084864 | 4213 | 25136 |
| 3 | 891689557279858688 | 8759 | 42299 |
| 4 | 891327558926688256 | 9521 | 40472 |
| 5 | 891087950875897856 | 3155 | 20293 |
| 6 | 890971913173991426 | 2104 | 11900 |
| 7 | 890729181411237888 | 19157 | 65817 |
| 8 | 890609185150312448 | 4321 | 27876 |
| 9 | 890240255349198849 | 7524 | 32075 |
| 10 | 890006608113172480 | 7439 | 30775 |
... For full data please check my Github page ...
2345 rows × 3 columns
Assessing data¶
# Image-prediction rows whose tweet_id repeats an earlier row.
df[df['tweet_id'].duplicated()]
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog |
|---|
# Archive rows whose tweet_id repeats an earlier row.
df1[df1['tweet_id'].duplicated()]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo |
|---|
# API-result rows whose tweet_id repeats an earlier row.
# df2 is built with the columns tweet_id / retweet_count / favorite_count,
# so the identifier column is 'tweet_id' (there is no 'id' column).
df2[df2['tweet_id'].duplicated()]
| id | retweet count | favorite_count |
|---|
# Count of missing tweet_id values in the image-prediction table.
int(df['tweet_id'].isnull().sum())
0
# Count of missing tweet_id values in the archive.
int(df1['tweet_id'].isnull().sum())
0
# Count of missing rating_numerator values in the archive.
int(df1['rating_numerator'].isnull().sum())
0
# Count of missing tweet_id values in the API results.
# df2 has no 'id' column; its identifier column is 'tweet_id'.
int(df2['tweet_id'].isnull().sum())
0
# Frequency of each rating denominator in the archive.
df1['rating_denominator'].value_counts()
10 2333 11 3 50 3 80 2 20 2 2 1 16 1 40 1 70 1 15 1 90 1 110 1 120 1 130 1 150 1 170 1 7 1 0 1 Name: rating_denominator, dtype: int64
# Frequency of each rating numerator in the archive.
df1['rating_numerator'].value_counts()
12 558 11 464 10 461 13 351 9 158 8 102 7 55 14 54 5 37 6 32 3 19 4 17 1 9 2 9 420 2 0 2 15 2 75 2 80 1 20 1 ...... Name: rating_numerator, dtype: int64
# Archive rows with a numerator of 20 or more.
df1[df1.rating_numerator.ge(20)]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 188 | 855862651834028034 | 8.558616e+17 | 1.943518e+08 | 2017-04-22 19:15:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | NaN | NaN | NaN | 420 | 10 | None | None | None | None | None |
| 189 | 855860136149123072 | 8.558585e+17 | 1.361572e+07 | 2017-04-22 19:05:32 +0000 | <a href="http://twitter.com/download/iphone" r... | @s8n You tried very hard to portray this good ... | NaN | NaN | NaN | NaN | 666 | 10 | None | None | None | None | None |
| 290 | 838150277551247360 | 8.381455e+17 | 2.195506e+07 | 2017-03-04 22:12:52 +0000 | <a href="http://twitter.com/download/iphone" r... | @markhoppus 182/10 | NaN | NaN | NaN | NaN | 182 | 10 | None | None | None | None | None |
| 313 | 835246439529840640 | 8.352460e+17 | 2.625958e+07 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
| 340 | 832215909146226688 | NaN | NaN | 2017-02-16 13:11:49 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Logan, the Chow who liv... | 7.867091e+17 | 4.196984e+09 | 2016-10-13 23:23:56 +0000 | https://twitter.com/dog_rates/status/786709082... | 75 | 10 | Logan | None | None | None | None |
... For full data please check my Github page ...
# Archive rows with a denominator below 10.
df1[df1.rating_denominator.lt(10)]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 313 | 835246439529840640 | 8.352460e+17 | 26259576.0 | 2017-02-24 21:54:03 +0000 | <a href="http://twitter.com/download/iphone" r... | @jonnysun @Lin_Manuel ok jomny I know you're e... | NaN | NaN | NaN | NaN | 960 | 0 | None | None | None | None | None |
| 516 | 810984652412424192 | NaN | NaN | 2016-12-19 23:06:23 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Sam. She smiles 24/7 & secretly aspir... | NaN | NaN | NaN | https://www.gofundme.com/sams-smile,https://tw... | 24 | 7 | Sam | None | None | None | None |
| 2335 | 666287406224695296 | NaN | NaN | 2015-11-16 16:11:11 +0000 | <a href="http://twitter.com/download/iphone" r... | This is an Albanian 3 1/2 legged Episcopalian... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666287406... | 1 | 2 | an | None | None | None | None |
# Show full cell contents so the complete tweet text is readable.
# NOTE(review): -1 is the older "no limit" value; newer pandas releases
# expect None here - confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)
# Archive rows whose name is written entirely in lower case.
df1[df1.name.str.islower().eq(True)]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22 | 887517139158093824 | NaN | NaN | 2017-07-19 03:39:09 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | I've yet to rate a Venezuelan Hover Wiener. This is such an honor. 14/10 paw-inspiring af (IG: roxy.thedoxy) https://t.co/20VrLAA8ba | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887517139158093824/video/1 | 14 | 10 | such | None | None | None | None |
| 56 | 881536004380872706 | NaN | NaN | 2017-07-02 15:32:16 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF | NaN | NaN | NaN | https://twitter.com/dog_rates/status/881536004380872706/video/1 | 14 | 10 | a | None | None | pupper | None |
| 118 | 869988702071779329 | NaN | NaN | 2017-05-31 18:47:24 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | RT @dog_rates: We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10… | 8.591970e+17 | 4.196984e+09 | 2017-05-02 00:04:57 +0000 | https://twitter.com/dog_rates/status/859196978902773760/video/1 | 12 | 10 | quite | None | None | None | None |
| 169 | 859196978902773760 | NaN | NaN | 2017-05-02 00:04:57 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 https://t.co/g2nSyGenG9 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/859196978902773760/video/1 | 12 | 10 | quite | None | None | None | None |
| 193 | 855459453768019968 | NaN | NaN | 2017-04-21 16:33:22 +0000 | <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> | Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective https://t.co/Xc7uj1C64x | NaN | NaN | NaN | https://twitter.com/dog_rates/status/855459453768019968/photo/1,https://twitter.com/dog_rates/status/855459453768019968/photo/1 | 12 | 10 | quite | None | None | None | None |
... For full data please check my Github page ...
109 rows × 17 columns
| rating_numerator | rating_denominator | text | |
|---|---|---|---|
| 342 | 11 | 15 | @docmisterio account started on 11/15/15 |
| 433 | 84 | 70 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd |
| 784 | 9 | 11 | RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… |
| 902 | 165 | 150 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE |
| 1068 | 9 | 11 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ |
| 1120 | 204 | 170 | Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv |
| 1165 | 4 | 20 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a |
| 1202 | 50 | 50 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq |
| 1228 | 99 | 90 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 |
| 1254 | 80 | 80 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 |
| 1274 | 45 | 50 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK |
| 1351 | 60 | 50 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa |
| 1433 | 44 | 40 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ |
| 1598 | 4 | 20 | Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating |
| 1634 | 143 | 130 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 |
| 1635 | 121 | 110 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 |
| 1662 | 7 | 11 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 |
| 1663 | 20 | 16 | I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible |
| 1779 | 144 | 120 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq |
| 1843 | 88 | 80 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw |
# Archive rows whose name already appeared in an earlier row.
df1[df1['name'].duplicated()]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 890729181411237888 | NaN | NaN | 2017-07-28 00:22:40 +0000 | <a href="http://twitter.com/download/iphone" r... | When you watch your owner call another dog a g... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890729181... | 13 | 10 | None | None | None | None | None |
| 12 | 889665388333682689 | NaN | NaN | 2017-07-25 01:55:32 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a puppo that seems to be on the fence a... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889665388... | 13 | 10 | None | None | None | None | puppo |
| 23 | 887473957103951883 | NaN | NaN | 2017-07-19 00:47:34 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Canela. She attempted some fancy porch... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
| 24 | 887343217045368832 | NaN | NaN | 2017-07-18 16:08:03 +0000 | <a href="http://twitter.com/download/iphone" r... | You may not have known you needed to see this ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887343217... | 13 | 10 | None | None | None | None | None |
| 25 | 887101392804085760 | NaN | NaN | 2017-07-18 00:07:08 +0000 | <a href="http://twitter.com/download/iphone" r... | This... is a Jubilant Antarctic House Bear. We... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887101392... | 12 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
1399 rows × 17 columns
# Archive rows where name is a real missing value (NaN).
df1[df1['name'].isnull()]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo |
|---|
# Archive rows whose text starts with the two characters 'RT'.
df1[df1.text.str[:2] == 'RT']
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
| 32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Athletics: 12/10 #BATP https://t.co/WxwJmv... | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434... | 12 | 10 | None | None | None | None | None |
| 36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Lilly. She just paralle... | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320... | 13 | 10 | Lilly | None | None | None | None |
| 68 | 879130579576475649 | NaN | NaN | 2017-06-26 00:13:58 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Emmy. She was adopted t... | 8.780576e+17 | 4.196984e+09 | 2017-06-23 01:10:23 +0000 | https://twitter.com/dog_rates/status/878057613... | 14 | 10 | Emmy | None | None | None | None |
| 73 | 878404777348136964 | NaN | NaN | 2017-06-24 00:09:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Meet Shadow. In an attempt to r... | 8.782815e+17 | 4.196984e+09 | 2017-06-23 16:00:04 +0000 | https://www.gofundme.com/3yd6y1c,https://twitt... | 13 | 10 | Shadow | None | None | None | None |
... For full data please check my Github page ...
183 rows × 17 columns
# Archive rows that carry a retweeted_status_id.
df1[~df1['retweeted_status_id'].isnull()]
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
| 32 | 886054160059072513 | NaN | NaN | 2017-07-15 02:45:48 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @Athletics: 12/10 #BATP https://t.co/WxwJmv... | 8.860537e+17 | 1.960740e+07 | 2017-07-15 02:44:07 +0000 | https://twitter.com/dog_rates/status/886053434... | 12 | 10 | None | None | None | None | None |
| 36 | 885311592912609280 | NaN | NaN | 2017-07-13 01:35:06 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Lilly. She just paralle... | 8.305833e+17 | 4.196984e+09 | 2017-02-12 01:04:29 +0000 | https://twitter.com/dog_rates/status/830583320... | 13 | 10 | Lilly | None | None | None | None |
| 68 | 879130579576475649 | NaN | NaN | 2017-06-26 00:13:58 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Emmy. She was adopted t... | 8.780576e+17 | 4.196984e+09 | 2017-06-23 01:10:23 +0000 | https://twitter.com/dog_rates/status/878057613... | 14 | 10 | Emmy | None | None | None | None |
| 73 | 878404777348136964 | NaN | NaN | 2017-06-24 00:09:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: Meet Shadow. In an attempt to r... | 8.782815e+17 | 4.196984e+09 | 2017-06-23 16:00:04 +0000 | https://www.gofundme.com/3yd6y1c,https://twitt... | 13 | 10 | Shadow | None | None | None | None |
... For full data please check my Github page ...
181 rows × 17 columns
# Summary statistics of the name column.
# NOTE(review): df1_clean is first assigned further down, in the cleaning
# section, so this cell only works once that copy exists.
df1_clean.name.describe()
count 2247 unique 932 top None freq 745 Name: name, dtype: object
# Show full cell contents so the whole source string is visible.
pd.set_option('display.max_colwidth', -1)
# Characters 72 up to the last four are compared with 'iPhone'; every row
# where that fixed-position slice differs is listed.
df1[df1.source.str[72:-4] != 'iPhone']
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 209 | 852226086759018497 | NaN | NaN | 2017-04-12 18:25:07 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | Meet General. He wasn't content with the quality of his room. Requested to pupgrade, but was ignored. 14/10 look who just lost a customer https://t.co/NP5JW8LnmW | NaN | NaN | NaN | https://twitter.com/dog_rates/status/852226086759018497/video/1 | 14 | 10 | General | None | None | None | None |
| 270 | 841314665196081154 | NaN | NaN | 2017-03-13 15:47:01 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | This is Max. There's no way in h*ck you're taking his pacifier. Binky promises it's not happening. 13/10 very good stubborn boy https://t.co/9lVAqDEvZ5 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/841314665196081154/video/1 | 13 | 10 | Max | None | None | None | None |
| 335 | 832645525019123713 | NaN | NaN | 2017-02-17 17:38:57 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | There's going to be a dog terminal at JFK Airport. This is not a drill. 10/10 \nhttps://t.co/dp5h9bCwU7 | NaN | NaN | NaN | http://us.blastingnews.com/news/2017/02/jfk-announces-its-first-ever-ark-oasis-animal-terminal-001480161.html?sbdht=_pM1QUzk3wsdTxcmMoRPV7FWYYlsNKcFRcYSY7OmeHnOXA4NtUM6PLQ2_ | 10 | 10 | not | None | None | None | None |
| 352 | 831315979191906304 | NaN | NaN | 2017-02-14 01:35:49 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | I couldn't make it to the #WKCDogShow BUT I have people there on the ground relaying me the finest pupper pics possible. 13/10 for all https://t.co/jd6lYhfdH4 | NaN | NaN | NaN | https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1,https://twitter.com/dog_rates/status/831315979191906304/photo/1 | 13 | 10 | None | None | None | pupper | None |
| 375 | 828361771580813312 | NaN | NaN | 2017-02-05 21:56:51 +0000 | <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> | Beebop and Doobert should start a band 12/10 would listen | NaN | NaN | NaN | NaN | 12 | 10 | None | None | None | None | None |
... For full data please check my Github page ...
135 rows × 17 columns
# Summary statistics of tweet_id in the API results.
df2.tweet_id.describe()
count 2345 unique 2345 top 778650543019483137 freq 1 Name: tweet_id, dtype: object
Quality¶
image-predictions.tsv
- Some values in P1, P2, and P3 begin with lower case, but some begin with upper case.
- Missing records (2075 vs. 2356 records in twitter-archive-enhanced.csv).
twitter-archive-enhanced.csv
- Incomplete data - missing value in multiple columns.
- Incorrect values in name (for example, such, an, a ...).
- Missing values are represented by both "NaN" and "None".
- Incorrect values in rating (both rating_numerator and rating_denominator).
- The numbers in retweeted_status_id and retweeted_status_user_id are also not displayed correctly (they appear as floats in scientific notation).
df2
- Missing 11 data records (error "No status found with that ID" when querying twitter API).
- Values are string not integer.
Tidiness¶
twitter-archive-enhanced.csv
- Source contains HTML code.
- Some records are for retweets
Cleaning data¶
# Work on copies so the three original tables stay untouched during cleaning.
df_clean, df1_clean, df2_clean = df.copy(), df1.copy(), df2.copy()
Missing Data¶
image-predictions.tsv¶
- Missing records (2075 vs 2356 records in twitter-archive-enhanced.csv ).
twitter-archive-enhanced.csv¶
- Incomplete data - missing value in multiple columns.
df2 (tweepy query result)¶
- Missing 11 data records (error "No status found with that ID" when querying twitter API).
The missing data are not cleanable¶
Tidiness¶
Some of the records in twitter-archive-enhanced.csv are retweets¶
Define¶
Remove the records that are for retweets in twitter-archive-enhanced.csv.
Code¶
# Keep only the rows that have no retweeted_status_id.
df1_clean = df1_clean[df1_clean['retweeted_status_id'].isnull()]
Test¶
# Remaining rows whose text still starts with 'RT'.
df1_clean[df1_clean.text.str[:2] == 'RT']
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1286 | 708400866336894977 | NaN | NaN | 2016-03-11 21:15:02 +0000 | <a href="http://vine.co" rel="nofollow">Vine -... | RT if you are as ready for summer as this pup ... | NaN | NaN | NaN | https://vine.co/v/iHFqnjKVbIQ | 12 | 10 | None | None | None | None | None |
| 1860 | 675489971617296384 | NaN | NaN | 2015-12-12 01:38:53 +0000 | <a href="http://twitter.com/download/iphone" r... | RT until we find this dog. Clearly a cool dog ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/675489971... | 10 | 10 | None | None | None | None | None |
Source in twitter-archive-enhanced.csv includes HTML code, url, and source.¶
Define¶
Extract the source string from HTML code
Code¶
# Renumber the rows 0..n-1; reset_index() keeps the previous labels in a new
# 'index' column.
df1_clean = df1_clean.reset_index()
# Capture the last word of the link text, i.e. the run of letters, digits and
# hyphens directly before '</a>'. The pattern is a raw string: in the earlier
# non-raw pattern '\b' was a backspace character, so its first alternative
# could never match and only this last-word alternative was ever used.
df1_clean['source'] = df1_clean.source.str.extract(r'([a-zA-Z0-9-]+)</a>', expand=False)
# Two of the last words are replaced by fuller labels in one vectorised step
# instead of a row-by-row loop.
df1_clean['source'] = df1_clean['source'].replace({'Client': 'Twitter Web Client',
                                                   'Scene': 'Vine'})
Test¶
# Frequency of each cleaned source label.
df1_clean['source'].value_counts()
iPhone 2042 Vine 91 Twitter Web Client 31 TweetDeck 11 Name: source, dtype: int64
Quality¶
Mixed lower and upper case in p1/p2/p3 columns of image-predictions.tsv¶
Define¶
Change values in p1/p2/p3 to begin with an upper-case letter followed by lower-case letters.
Code¶
# Prediction columns to normalise: first character upper case, rest lower case.
df_col = ['p1', 'p2', 'p3']
for col in df_col:
    df_clean[col] = df_clean[col].str.capitalize()
Test¶
# Show the three normalised prediction columns.
df_clean.loc[:, ['p1', 'p2', 'p3']]
| p1 | p2 | p3 | |
|---|---|---|---|
| 0 | Welsh_springer_spaniel | Collie | Shetland_sheepdog |
| 1 | Redbone | Miniature_pinscher | Rhodesian_ridgeback |
| 2 | German_shepherd | Malinois | Bloodhound |
| 3 | Rhodesian_ridgeback | Redbone | Miniature_pinscher |
| 4 | Miniature_pinscher | Rottweiler | Doberman |
| 5 | Bernese_mountain_dog | English_springer | Greater_swiss_mountain_dog |
| 6 | Box_turtle | Mud_turtle | Terrapin |
| 7 | Chow | Tibetan_mastiff | Fur_coat |
| 8 | Shopping_cart | Shopping_basket | Golden_retriever |
| 9 | Miniature_poodle | Komondor | Soft-coated_wheaten_terrier |
| 10 | Golden_retriever | Tibetan_mastiff | Labrador_retriever |
... For full data please check my Github page ...
2075 rows × 3 columns
Inaccurate names in twitter-archive-enhanced.csv¶
Define¶
Extract the name from the text column and use it to replace the wrong name in the name column.
Code¶
# A candidate name is a capitalised word that follows 'named ' or 'is '.
# NOTE(review): 'is ' has no leading word boundary, so it can also match the
# end of words such as 'This' - confirm this is intended.
regex_name2 = 'named ([A-Z][a-z]+)|is ([A-Z][a-z]+)'
r_name2 = re.compile(regex_name2)
for i in range(len(df1_clean)):
    # Only names written entirely in lower case are treated as wrong.
    if not df1_clean.loc[i, 'name'].islower():
        continue
    df1_name = r_name2.findall(df1_clean.loc[i, 'text'])
    if len(df1_name) == 0:
        # Nothing usable in the text: fall back to the "None" marker.
        df1_clean.at[i, 'name'] = "None"
    elif len(df1_name) == 1:
        # Each match fills exactly one of the two groups; the other is ''.
        after_named, after_is = df1_name[0]
        df1_clean.at[i, 'name'] = after_named or after_is
        print(df1_name)
    # With two or more matches the name is left as it was.
[('', 'Freudian')]
[('', 'Arctic')]
[('', 'Zoey')]
[('', 'Quizno')]
[('', 'Jamaican')]
[('', 'Alaskan')]
[('', 'Bulgarian')]
[('', 'Dutch')]
[('', 'Mongolian')]
[('Wylie', '')]
[('Kip', '')]
[('Jacob', '')]
[('Rufus', '')]
[('Spork', '')]
[('Cherokee', '')]
[('Hemry', '')]
[('Alphred', '')]
[('Alfredo', '')]
[('Leroi', '')]
[('Berta', '')]
[('Chuk', '')]
[('Alfonso', '')]
[('Cheryl', '')]
[('Jessiga', '')]
[('Klint', '')]
[('Kohl', '')]
[('', 'Daryl')]
[('Pepe', '')]
[('Octaviath', '')]
[('Johm', '')]
Test¶
df1_clean[(df1_clean['tweet_id'] == 666701168228331520)]
| index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2133 | 2314 | 666701168228331520 | NaN | NaN | 2015-11-17 19:35:19 +0000 | iPhone | This is a golden Buckminsterfullerene named Jo... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666701168... | 8 | 10 | Johm | None | None | None | None |
df1_clean[(df1_clean['tweet_id'] == 748692773788876800)]
| index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 817 | 992 | 748692773788876800 | NaN | NaN | 2016-07-01 01:40:41 +0000 | iPhone | That is Quizno. This is his beach. He does not... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/748692773... | 10 | 10 | Quizno | doggo | None | None | None |
df1_clean.sample(200)
| index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1008 | 1186 | 718540630683709445 | NaN | NaN | 2016-04-08 20:46:50 +0000 | iPhone | Get you a pup that can do both. 10/10 https://... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/718540630... | 10 | 10 | None | None | None | None | None |
| 951 | 1129 | 729463711119904772 | NaN | NaN | 2016-05-09 00:11:16 +0000 | iPhone | Meet Pupcasso. You can't afford his art. 13/10... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/729463711... | 13 | 10 | Pupcasso | None | None | None | None |
| 1964 | 2143 | 669970042633789440 | NaN | NaN | 2015-11-26 20:04:40 +0000 | iPhone | This is Julio. He was one of the original Ring... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/669970042... | 10 | 10 | Julio | None | None | None | None |
| 803 | 978 | 749996283729883136 | NaN | NaN | 2016-07-04 16:00:22 +0000 | TweetDeck | This is Bo. He emanates happiness. 12/10 I cou... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/749996283... | 12 | 10 | Bo | None | None | None | None |
| 1494 | 1673 | 682303737705140231 | NaN | NaN | 2015-12-30 20:54:22 +0000 | iPhone | This is Todo. He's screaming because he doesn'... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/682303737... | 9 | 10 | Todo | None | None | None | None |
... For full data please check my Github page ...
200 rows × 18 columns
Missing values are represented by both "NaN" and "None".¶
Define¶
Replace the "None" in twitter-archive-enhanced.csv with "NaN".
Code¶
# Use NaN as the only missing-value marker: every cell that is exactly the
# string "None" is overwritten in place.
df1_clean.replace(to_replace="None", value=np.nan, inplace=True)
Test¶
df1_clean.sample(200)
| index | tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 494 | 614 | 796759840936919040 | NaN | NaN | 2016-11-10 17:02:03 +0000 | iPhone | Say hello to Romeo. He was just told that it's... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/796759840... | 11 | 10 | Romeo | NaN | NaN | NaN | NaN |
| 2151 | 2332 | 666345417576210432 | NaN | NaN | 2015-11-16 20:01:42 +0000 | iPhone | Look at this jokester thinking seat belt laws ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666345417... | 10 | 10 | NaN | NaN | NaN | NaN | NaN |
| 1444 | 1623 | 684902183876321280 | NaN | NaN | 2016-01-07 00:59:40 +0000 | iPhone | This is Perry. He's an Augustus Gloopster. Ver... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/684902183... | 11 | 10 | Perry | NaN | NaN | NaN | NaN |
| 1524 | 1703 | 680913438424612864 | NaN | NaN | 2015-12-27 00:49:49 +0000 | iPhone | Meet Griswold. He's dapper as hell. Already pu... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/680913438... | 11 | 10 | Griswold | NaN | NaN | NaN | NaN |
| 1782 | 1961 | 673359818736984064 | NaN | NaN | 2015-12-06 04:34:25 +0000 | iPhone | This is Steve. He was just relaxing in hot tub... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/673359818... | 8 | 10 | Steve | NaN | NaN | NaN | NaN |
... For full data please check my Github page ...
200 rows × 18 columns
Incorrect values in rating (both rating_numerator and rating_denominator).¶
Define¶
List all the ratings with a denominator not equal to 10 and correct the wrong ratings. Remove the records that do not have a rating.
Code¶
# Show full cell contents so the complete tweet text is readable.
# `None` is the supported "no limit" value; the legacy -1 is deprecated and
# rejected by newer pandas, while old releases only accept an int, hence the
# fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Display the rating columns and the tweet text for every row whose
# denominator is something other than 10.
non_ten_denominator = df1_clean['rating_denominator'] != 10
df1_clean.loc[non_ten_denominator, ['rating_numerator', 'rating_denominator', 'text']]
| rating_numerator | rating_denominator | text | |
|---|---|---|---|
| 263 | 960 | 0 | @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho |
| 287 | 11 | 15 | @docmisterio account started on 11/15/15 |
| 363 | 84 | 70 | The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd |
| 429 | 24 | 7 | Meet Sam. She smiles 24/7 & secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx |
| 733 | 165 | 150 | Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE |
| 890 | 9 | 11 | After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ |
| 942 | 204 | 170 | Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv |
| 987 | 4 | 20 | Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a |
| 1024 | 50 | 50 | This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq |
| 1050 | 99 | 90 | Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 |
| 1075 | 80 | 80 | Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 |
| 1095 | 45 | 50 | From left to right:\nCletus, Jerome, Alejandro, Burp, & Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK |
| 1172 | 60 | 50 | Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa |
| 1254 | 44 | 40 | Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ |
| 1419 | 4 | 20 | Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating |
| 1455 | 143 | 130 | Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 |
| 1456 | 121 | 110 | Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 |
| 1483 | 7 | 11 | This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 |
| 1484 | 20 | 16 | I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible |
| 1600 | 144 | 120 | IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq |
| 1664 | 88 | 80 | Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw |
| 2154 | 1 | 2 | This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv |
# Manual corrections for rows where the parsed "n/d" was not the real rating.
# Keys are row labels of df1_clean; values are (numerator, denominator) as
# read from the tweet text.
rating_fixes = {
    263: (13, 10),
    890: (14, 10),
    987: (13, 10),
    1024: (11, 10),   # "50/50" was picked up; the actual rating is 11/10
    1483: (10, 10),
    2154: (9, 10),
}
# Rows whose text contains an "n/d" pattern but no rating at all.
# 1419 ("4/20") and 1484 ("20/16") only mention a hypothetical rating.
no_rating_rows = [287, 429, 1419, 1484]

for row_label, (numerator, denominator) in rating_fixes.items():
    df1_clean.loc[row_label, 'rating_numerator'] = numerator
    df1_clean.loc[row_label, 'rating_denominator'] = denominator

# Set both numerator and denominator to 0 if there is no rating in the text.
df1_clean.loc[no_rating_rows, ['rating_numerator', 'rating_denominator']] = 0

# Remove the records that do not have a rating (denominator of 0).
df1_clean = df1_clean.loc[df1_clean['rating_denominator'] != 0]
Test¶
df1_clean.rating_denominator.value_counts()
10 2158 50 3 80 2 170 1 150 1 130 1 120 1 110 1 90 1 70 1 40 1 20 1 16 1 Name: rating_denominator, dtype: int64
The numbers in retweeted_status_id and retweeted_status_user_id are also not displayed correctly.¶
This was already resolved by cleaning tidiness issue 1 (removing all the retweet data), but I would like to remove the retweeted-status and in-reply-to-status columns here to keep the master dataframe slim.¶
Define¶
Remove in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp columns
Code¶
# Drop the reply and retweet metadata columns from the archive table.
reply_and_retweet_cols = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
df1_clean = df1_clean.drop(labels=reply_and_retweet_cols, axis='columns')
Test¶
df1_clean
| index | tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 892420643555336193 | 2017-08-01 16:23:56 +0000 | iPhone | This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU | https://twitter.com/dog_rates/status/892420643555336193/photo/1 | 13 | 10 | Phineas | NaN | NaN | NaN | NaN |
| 1 | 1 | 892177421306343426 | 2017-08-01 00:17:27 +0000 | iPhone | This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV | https://twitter.com/dog_rates/status/892177421306343426/photo/1 | 13 | 10 | Tilly | NaN | NaN | NaN | NaN |
| 2 | 2 | 891815181378084864 | 2017-07-31 00:18:03 +0000 | iPhone | This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB | https://twitter.com/dog_rates/status/891815181378084864/photo/1 | 12 | 10 | Archie | NaN | NaN | NaN | NaN |
| 3 | 3 | 891689557279858688 | 2017-07-30 15:58:51 +0000 | iPhone | This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ | https://twitter.com/dog_rates/status/891689557279858688/photo/1 | 13 | 10 | Darla | NaN | NaN | NaN | NaN |
| 4 | 4 | 891327558926688256 | 2017-07-29 16:00:24 +0000 | iPhone | This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f | https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1 | 12 | 10 | Franklin | NaN | NaN | NaN | NaN |
... For full data please check my Github page ...
2173 rows × 13 columns
Values in df2 are strings, not numbers¶
Define¶
Convert the columns in df2 to int
Code¶
# Cast the three columns to an explicit 64-bit integer type.  Plain `int`
# follows NumPy's platform default, which can be 32-bit (e.g. Windows with
# NumPy 1.x) and is then too small for 18-digit tweet IDs.
for int_col in ('tweet_id', 'retweet_count', 'favorite_count'):
    df2_clean[int_col] = df2_clean[int_col].astype('int64')
Test¶
df2_clean['tweet_id'].describe()
count 2.345000e+03 mean 7.422940e+17 std 6.833642e+16 min 6.660209e+17 25% 6.783802e+17 50% 7.189392e+17 75% 7.986979e+17 max 8.924206e+17 Name: tweet_id, dtype: float64
Merging all three dataframes¶
# Combine the three cleaned tables into one, matching rows on tweet_id:
# image predictions first, then the archive, then the tweet counts.
predictions_with_archive = df_clean.merge(df1_clean, on=['tweet_id'])
df_master_clean = predictions_with_archive.merge(df2_clean, on=['tweet_id'])
df_master_clean
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | ... | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | Collie | 0.156665 | True | Shetland_sheepdog | ... | https://twitter.com/dog_rates/status/666020888022790149/photo/1 | 8 | 10 | NaN | NaN | NaN | NaN | NaN | 521 | 2560 |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | Redbone | 0.506826 | True | Miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | ... | https://twitter.com/dog_rates/status/666029285002620928/photo/1 | 7 | 10 | NaN | NaN | NaN | NaN | NaN | 47 | 131 |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | Malinois | 0.138584 | True | Bloodhound | ... | https://twitter.com/dog_rates/status/666033412701032449/photo/1 | 9 | 10 | NaN | NaN | NaN | NaN | NaN | 45 | 125 |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | Redbone | 0.360687 | True | Miniature_pinscher | ... | https://twitter.com/dog_rates/status/666044226329800704/photo/1 | 6 | 10 | NaN | NaN | NaN | NaN | NaN | 141 | 302 |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | Miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | ... | https://twitter.com/dog_rates/status/666049248165822465/photo/1 | 5 | 10 | NaN | NaN | NaN | NaN | NaN | 40 | 109 |
| 5 | 666050758794694657 | https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg | 1 | Bernese_mountain_dog | 0.651137 | True | English_springer | 0.263788 | True | Greater_swiss_mountain_dog | ... | https://twitter.com/dog_rates/status/666050758794694657/photo/1 | 10 | 10 | NaN | NaN | NaN | NaN | NaN | 58 | 133 |
... For full data please check my Github page ...
1993 rows × 26 columns
Exporting master dataframe to CSV file¶
# Write the master table without the row index; otherwise the index is stored
# as an unnamed first column and comes back as "Unnamed: 0" when the file is
# read again.
df_master_clean.to_csv('twitter_archive_master.csv', index=False)
Analyzing and Visualizing Data¶
# Load the exported master file; all analysis below works from this frame.
df_master = pd.read_csv(filepath_or_buffer='twitter_archive_master.csv')
The breeds of dog ranked in the top 20 favorited tweets.¶
# Top prediction (p1), its is-dog flag and the favorite count for the 20
# tweets with the highest favorite_count.
fav_columns = ['p1', 'p1_dog', 'favorite_count']
by_favorites = df_master.sort_values(by='favorite_count', ascending=False)
df_top_fav = by_favorites[fav_columns].head(20)
df_top_fav
| p1 | p1_dog | favorite_count | |
|---|---|---|---|
| 1683 | Lakeland_terrier | True | 143952 |
| 1218 | Labrador_retriever | True | 128764 |
| 1884 | French_bulldog | True | 124651 |
| 1593 | Chihuahua | True | 123459 |
| 1934 | English_springer | True | 106354 |
| 1659 | Standard_poodle | True | 94025 |
| 1857 | Angora | False | 92951 |
| 1899 | Golden_retriever | True | 83699 |
| 1591 | Arabian_camel | False | 82538 |
| 1927 | Chesapeake_bay_retriever | True | 80383 |
| 511 | Bubble | False | 79048 |
| 1959 | Italian_greyhound | True | 77876 |
| 1868 | Chow | True | 76730 |
| 1183 | Eskimo_dog | True | 73658 |
| 1727 | Labrador_retriever | True | 72236 |
| 1970 | Pembroke | True | 69403 |
| 1892 | Laptop | False | 66664 |
| 1985 | Pomeranian | True | 65817 |
| 569 | Swing | False | 61243 |
| 1720 | Boxer | True | 57279 |
The breeds of dog ranked in the top 20 retweeted tweets.¶
# Top prediction (p1), its is-dog flag and the retweet count for the 20
# tweets with the highest retweet_count.
ret_columns = ['p1', 'p1_dog', 'retweet_count']
by_retweets = df_master.sort_values(by='retweet_count', ascending=False)
df_top_ret = by_retweets[ret_columns].head(20)
df_top_ret
| p1 | p1_dog | retweet_count | |
|---|---|---|---|
| 1218 | Labrador_retriever | True | 77716 |
| 1593 | Chihuahua | True | 61444 |
| 1183 | Eskimo_dog | True | 51148 |
| 1683 | Lakeland_terrier | True | 49407 |
| 1934 | English_springer | True | 44828 |
... For full data please contact me ...
The top rated dogs and their tweet info¶
# No earlier cell in this notebook adds a `rating` column to the master table,
# so derive it here when it is absent instead of failing with a KeyError.
# NOTE(review): numerator / denominator is assumed; it is consistent with the
# displayed values (e.g. 177.6, 42.0) -- confirm against the original intent.
if 'rating' not in df_master.columns:
    df_master['rating'] = df_master['rating_numerator'] / df_master['rating_denominator']

# Tweets rated 1.4 or higher, best first.
is_top_rated = df_master['rating'] >= 1.4
df_top_rated = df_master.loc[is_top_rated, ['p1', 'p1_dog', 'rating']].sort_values('rating', ascending=False)
df_top_rated
| p1 | p1_dog | rating | |
|---|---|---|---|
| 1267 | Bow_tie | False | 177.6 |
| 275 | Microphone | False | 42.0 |
| 1490 | Pomeranian | True | 7.5 |
| 1440 | Clumber | True | 2.7 |
| 619 | Kuvasz | True | 2.6 |
... For full data please contact me ...
Visualization¶
The most popular breeds of dog¶
# Stack the three "top" tables.  DataFrame.append was removed in pandas 2.0;
# pd.concat is the replacement.  sort=True orders the non-shared columns
# alphabetically, which is the layout append used to produce.
df_top_dogs = pd.concat([df_top_fav, df_top_ret, df_top_rated], sort=True)
# Keep only the rows whose top prediction is flagged as a dog.
df_top_dogs = df_top_dogs[df_top_dogs['p1_dog']]
df_top_dogs
| favorite_count | p1 | p1_dog | rating | retweet_count | |
|---|---|---|---|---|---|
| 1683 | 143952.0 | Lakeland_terrier | True | NaN | NaN |
| 1218 | 128764.0 | Labrador_retriever | True | NaN | NaN |
| 1884 | 124651.0 | French_bulldog | True | NaN | NaN |
| 1593 | 123459.0 | Chihuahua | True | NaN | NaN |
| 1934 | 106354.0 | English_springer | True | NaN | NaN |
... For full data please contact me ...
# Horizontal bar chart of how many times each breed occurs in df_top_dogs,
# with the counts sorted in ascending order before plotting.
breed_counts = df_top_dogs['p1'].value_counts().sort_values(ascending=True)
breed_counts.plot(kind='barh', figsize=(8, 8), rot=0)
plt.title('The most popular breed found')
Text(0.5,1,'The most popular breed found')
# Not part of the report: number of tweets per source.
df_master['source'].value_counts()
iPhone 1954 Twitter Web Client 28 TweetDeck 11 Name: source, dtype: int64
# Pie chart of the tweet sources, each slice labelled with its percentage
# (two decimals).
source_counts = df_master['source'].value_counts()
source_counts.plot(kind='pie', figsize=(8, 8), autopct='%.2f')
# Hide the axes and use equal axis scaling.
plt.axis('off')
plt.axis('equal')
plt.title('Source pie chart')
Text(0.5,1,'Source pie chart')