import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


# Read the TikTok track listing from disk.
tiktok_data = pd.read_csv('tiktok.csv')

# Discard the columns that the rest of the analysis never reads.
tiktok_data = tiktok_data.drop(
    columns=[
        'Unnamed: 0', 'track_id', 'artist_id', 'album_id',
        'playlist_id', 'genre', 'playlist_name', 'duration_mins',
    ]
)
# Preview the first rows (only rendered when this runs as a notebook cell).
tiktok_data.head()


# Read the Spotify top-50 track listing from disk.
spotify_data = pd.read_csv('spotify_top50_2021.csv')

# Discard the identifier columns that the rest of the analysis never reads.
spotify_data = spotify_data.drop(columns=['id', 'track_id'])

# Preview the first rows (only rendered when this runs as a notebook cell).
spotify_data.head()


# Reduce the TikTok data to its 50 most frequently occurring tracks:
#   1. add a 'count' column holding how often each row's track_name occurs,
#   2. order rows from most to least frequent,
#   3. keep a single row per track_name (the first one after sorting),
#   4. keep the first 50 rows and renumber the index from 0.
tiktok_data = (
    tiktok_data
    .assign(count=lambda df: df['track_name'].map(df['track_name'].value_counts()))
    .sort_values(by='count', ascending=False)
    .drop_duplicates(subset='track_name')
    .head(50)
    .reset_index(drop=True)
)
tiktok_data.head()


# Horizontal bar chart: one bar per top-50 TikTok song, bar length = its count.
# The size is passed to plt.subplots directly; assigning
# rcParams['figure.figsize'] after the figure has been created does not
# resize that figure.
fig, ax = plt.subplots(figsize=(20, 25))
ax.set_title("Top 50 Songs on Tiktok", fontsize=16)
X = tiktok_data["count"]
Y = tiktok_data["track_name"]
# Draw on the axes created above; `fig` stays bound to the Figure rather
# than being overwritten with the Axes that sns.barplot returns.
sns.barplot(y=Y, x=X, ax=ax)

# Vertical reference line at the mean count of the plotted songs.
avg = tiktok_data["count"].mean()
ax.axvline(avg, color="black", linewidth=2);


# Annotated correlation heatmap of the TikTok columns.
sns.set_theme(rc={'figure.figsize': (13, 13)})
c_map = sns.diverging_palette(220, 20, as_cmap=True)
# Correlate only numeric/boolean columns. The frame still carries text
# columns such as track_name and artist_name, and DataFrame.corr() on
# pandas >= 2.0 raises on non-numeric data instead of skipping it.
sns.heatmap(
    tiktok_data.select_dtypes(include=['number', 'bool']).corr(),
    cmap=c_map,
    annot=True,
)
plt.show()


# Annotated correlation heatmap of the Spotify columns.
sns.set_theme(rc={'figure.figsize': (13, 13)})
c_map = sns.diverging_palette(220, 20, as_cmap=True)
# Correlate only numeric/boolean columns. The frame still carries text
# columns such as track_name and artist_name, and DataFrame.corr() on
# pandas >= 2.0 raises on non-numeric data instead of skipping it.
sns.heatmap(
    spotify_data.select_dtypes(include=['number', 'bool']).corr(),
    cmap=c_map,
    annot=True,
)
plt.show()


# Grouped histogram comparing the 'energy' distribution of TikTok and Spotify.
fig, ax = plt.subplots()

# Derive ONE set of bin edges from both datasets. Binning Spotify with edges
# computed from TikTok alone silently drops every Spotify value that lies
# outside TikTok's min/max, so the Spotify bars would undercount.
shared_bins = np.histogram_bin_edges(
    np.concatenate([tiktok_data['energy'], spotify_data['energy']]), bins=10
)
a_heights, a_bins = np.histogram(tiktok_data['energy'], bins=shared_bins)
b_heights, b_bins = np.histogram(spotify_data['energy'], bins=shared_bins)

# Each bar takes a third of a bin so the two series sit next to each other.
width = (a_bins[1] - a_bins[0]) / 3

# Draw the two series, the Spotify bars shifted right by one bar width.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='TikTok')
ax.bar(b_bins[:-1] + width, b_heights, width=width, facecolor='plum', label='Spotify')
fig.suptitle('Energy', fontsize=20)
ax.set_ylabel('Number of Songs', fontsize=16)
ax.set_xlabel('Level of Energy', fontsize=16)
ax.legend();


# Grouped histogram comparing the 'loudness' distribution of TikTok and Spotify.
fig, ax = plt.subplots()

# Derive ONE set of bin edges from both datasets. Binning Spotify with edges
# computed from TikTok alone silently drops every Spotify value that lies
# outside TikTok's min/max, so the Spotify bars would undercount.
shared_bins = np.histogram_bin_edges(
    np.concatenate([tiktok_data['loudness'], spotify_data['loudness']]), bins=10
)
a_heights, a_bins = np.histogram(tiktok_data['loudness'], bins=shared_bins)
b_heights, b_bins = np.histogram(spotify_data['loudness'], bins=shared_bins)

# Each bar takes a third of a bin so the two series sit next to each other.
width = (a_bins[1] - a_bins[0]) / 3

# Draw the two series, the Spotify bars shifted right by one bar width.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='TikTok')
ax.bar(b_bins[:-1] + width, b_heights, width=width, facecolor='plum', label='Spotify')
fig.suptitle('Loudness', fontsize=20)
ax.set_ylabel('Number of Songs', fontsize=16)
ax.set_xlabel('Level of Loudness', fontsize=16)
ax.legend();


# Scatter plot of energy (y axis) against loudness (x axis) for both datasets,
# each with a degree-1 least-squares fit line.
# A fresh figure is created explicitly so the points are never drawn on top
# of whichever axes happen to be current.
fig, ax = plt.subplots()

# plot points and line for TikTok
x = tiktok_data['loudness']
y = tiktok_data['energy']
ax.scatter(x, y, c='b', label='TikTok')
m, b = np.polyfit(x, y, 1)
ax.plot(x, m*x+b, c='b')
# The fit predicts energy (y) from loudness (x); the text states exactly that.
eq1 = "TikTok Line of Best Fit: y[energy] = "+str(m)+" * x[loudness] + "+str(b)

# plot points and line for Spotify
x = spotify_data['loudness']
y = spotify_data['energy']
ax.scatter(x, y, c='purple', label='Spotify')
m, b = np.polyfit(x, y, 1)
ax.plot(x, m*x+b, c='purple')
eq2 = "Spotify Line of Best Fit: y[energy] = "+str(m)+" * x[loudness] + "+str(b)

# set titles and labels (x carries loudness, y carries energy)
ax.set_title('Loudness vs. Energy', fontsize=16)
ax.set_ylabel('Energy', fontsize=16)
ax.set_xlabel('Loudness', fontsize=16)
ax.legend(loc='upper left')
plt.show()


print(eq1)
print(eq2)

TikTok Line of Best Fit: y[loudness] = 0.03922673705836067 * x[energy] + 0.8775505849346418
Spotify Line of Best Fit: y[loudness] = 0.054131218234488826 * x[energy] + 0.9659463819589633


# Grouped histogram comparing the 'danceability' distribution of TikTok and Spotify.
fig, ax = plt.subplots()

# Derive ONE set of bin edges from both datasets. Binning Spotify with edges
# computed from TikTok alone silently drops every Spotify value that lies
# outside TikTok's min/max, so the Spotify bars would undercount.
shared_bins = np.histogram_bin_edges(
    np.concatenate([tiktok_data['danceability'], spotify_data['danceability']]), bins=10
)
a_heights, a_bins = np.histogram(tiktok_data['danceability'], bins=shared_bins)
b_heights, b_bins = np.histogram(spotify_data['danceability'], bins=shared_bins)

# Each bar takes a third of a bin so the two series sit next to each other.
width = (a_bins[1] - a_bins[0]) / 3

# Draw the two series, the Spotify bars shifted right by one bar width.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='TikTok')
ax.bar(b_bins[:-1] + width, b_heights, width=width, facecolor='plum', label='Spotify')
fig.suptitle('Danceability', fontsize=20)
ax.set_ylabel('Number of Songs', fontsize=16)
ax.set_xlabel('Level of Danceability', fontsize=16)
ax.legend();


# Grouped histogram comparing the 'acousticness' distribution of TikTok and Spotify.
fig, ax = plt.subplots()

# Derive ONE set of bin edges from both datasets. Binning Spotify with edges
# computed from TikTok alone silently drops every Spotify value that lies
# outside TikTok's min/max, so the Spotify bars would undercount.
shared_bins = np.histogram_bin_edges(
    np.concatenate([tiktok_data['acousticness'], spotify_data['acousticness']]), bins=10
)
a_heights, a_bins = np.histogram(tiktok_data['acousticness'], bins=shared_bins)
b_heights, b_bins = np.histogram(spotify_data['acousticness'], bins=shared_bins)

# Each bar takes a third of a bin so the two series sit next to each other.
width = (a_bins[1] - a_bins[0]) / 3

# Draw the two series, the Spotify bars shifted right by one bar width.
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue', label='TikTok')
ax.bar(b_bins[:-1] + width, b_heights, width=width, facecolor='plum', label='Spotify')
fig.suptitle('Acousticness', fontsize=20)
ax.set_ylabel('Number of Songs', fontsize=16)
ax.set_xlabel('Level of Acousticness', fontsize=16)
ax.legend();


# Kernel density estimates of 'tempo' for both datasets on a shared set of axes.
# The axes are created explicitly and handed to both plot calls, so the
# curves never land on whichever figure happened to be current.
fig, ax = plt.subplots()
tiktok_data["tempo"].plot.kde(bw_method=0.15, c='blue', label='TikTok', ax=ax)
spotify_data["tempo"].plot.kde(bw_method=0.15, c='purple', label='Spotify', ax=ax)

# labels and title
ax.set_title('Tempo', fontsize=16)
ax.set_ylabel('Density', fontsize=16)
ax.set_xlabel('Tempo (Beats per Minute)', fontsize=16)
ax.legend(loc='upper left');


# Print describe() summaries of the 'tempo' column for each dataset.
# sep="\n" puts the table on its own line; passing "...\n" as a separate
# argument makes print() insert a space after the newline, which pushes the
# first row of the table one column to the right.
print("TikTok's Summary Statistics on Tempo:", tiktok_data["tempo"].describe(), sep="\n")
print("Spotify's Summary Statistics on Tempo:", spotify_data["tempo"].describe(), sep="\n")

TikTok's Summary Statistics on Tempo:
 count     50.000000
mean     119.439340
std       22.949968
min       71.994000
25%      100.787750
50%      119.934500
75%      132.315000
max      171.020000
Name: tempo, dtype: float64
Spotify's Summary Statistics on Tempo:
 count     50.000000
mean     121.083860
std       29.252206
min       72.017000
25%       98.655500
50%      120.516500
75%      138.532000
max      180.917000
Name: tempo, dtype: float64


# Print the describe() summary of Spotify's 'time_signature' column.
# sep="\n" puts the table on its own line; passing "...\n" as a separate
# argument makes print() insert a space after the newline, which pushes the
# first row of the table one column to the right.
print("Spotify's Summary Statistics on Time Signature:", spotify_data["time_signature"].describe(), sep="\n")

Spotify's Summary Statistics on Time Signature:
 count    50.000000
mean      3.960000
std       0.197949
min       3.000000
25%       4.000000
50%       4.000000
75%       4.000000
max       4.000000
Name: time_signature, dtype: float64


# Rank TikTok artists by number of rows: give every row a unit 'Count',
# total it per artist_name, and order the totals from largest to smallest.
tiktok_data['Count'] = 1
tiktok_artist = (
    tiktok_data
    .groupby('artist_name')['Count']
    .sum()
    .reset_index()
    .sort_values(by='Count', ascending=False)
)
tiktok_top_ten = tiktok_artist.head(10)

# Same ranking for the Spotify data.
spotify_data['Count'] = 1
spotify_artist = (
    spotify_data
    .groupby('artist_name')['Count']
    .sum()
    .reset_index()
    .sort_values(by='Count', ascending=False)
)
spotify_top_ten = spotify_artist.head(10)

# Place the two top-ten tables next to each other; both are renumbered from 0
# so that their rows line up.
print("Left: TikTok | Right: Spotify")
pd.concat(
    [
        tiktok_top_ten.reset_index(drop=True),
        spotify_top_ten.reset_index(drop=True),
    ],
    axis=1,
)

Left: TikTok | Right: Spotify


# Songs present in both datasets, matched on identical track_name.
common_songs = pd.merge(tiktok_data, spotify_data, on='track_name', how='inner')

# Keep only the title and the artist name taken from the TikTok side
# (the '_x' suffix is what merge gives the left frame's overlapping columns).
common_songs = common_songs[['track_name','artist_name_x']]

# NOTE(review): `display` is not defined in this file; it is presumably the
# IPython/Jupyter built-in, so this line only works inside a notebook.
display(common_songs)

# Compute the overlap from the data instead of hard-coding 8/50, so the figure
# stays correct if either input file changes. Distinct titles are counted so a
# title that matched more than one row is not counted twice.
shared_count = common_songs['track_name'].nunique()
total_count = len(tiktok_data)
shared_pct = shared_count / total_count * 100 if total_count else 0.0
print(f"{shared_pct:.1f}", "% songs in common")

16.0 % songs in common

	track_name	artist_name	duration	release_date	popularity	danceability	energy	key	loudness	mode	speechiness	acousticness	liveness	valence	tempo
0	Lay It Down Gmix - Main	Lloyd	302186	2011-01-01	28	0.597	0.800	1	-5.423	0	0.3120	0.0461	0.1800	0.565	155.932
1	Bartender (feat. Akon)	T-Pain	238800	2007-06-05	75	0.832	0.391	8	-8.504	1	0.0628	0.0564	0.2240	0.436	104.961
2	Bartender (feat. Akon)	T-Pain	238800	2007-06-05	75	0.832	0.391	8	-8.504	1	0.0628	0.0564	0.2240	0.436	104.961
3	Chosen (feat. Ty Dolla $ign)	Blxst	161684	2020-12-04	76	0.571	0.767	2	-5.160	1	0.2870	0.3360	0.0809	0.605	93.421
4	Tie Me Down (with Elley Duhé)	Gryffin	218295	2018-08-03	72	0.548	0.839	6	-2.371	1	0.0644	0.1350	0.1020	0.314	98.932

	artist_name	track_name	popularity	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	duration_ms	time_signature
0	Olivia Rodrigo	drivers license	92	0.561	0.431	10	-8.810	1	0.0578	0.76800	0.000014	0.1060	0.137	143.875	242013	4
1	Lil Nas X	MONTERO (Call Me By Your Name)	90	0.593	0.503	8	-6.725	0	0.2200	0.29300	0.000000	0.4050	0.710	178.781	137704	4
2	The Kid LAROI	STAY (with Justin Bieber)	92	0.591	0.764	1	-5.484	1	0.0483	0.03830	0.000000	0.1030	0.478	169.928	141806	4
3	Olivia Rodrigo	good 4 u	95	0.563	0.664	9	-5.044	1	0.1540	0.33500	0.000000	0.0849	0.688	166.928	178147	4
4	Dua Lipa	Levitating (feat. DaBaby)	89	0.702	0.825	6	-3.787	0	0.0601	0.00883	0.000000	0.0674	0.915	102.977	203064	4

	track_name	artist_name	duration	release_date	popularity	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	count
0	Don't Start Now	Dua Lipa	183290	2019-10-31	85	0.794	0.793	11	-4.521	0	0.0842	0.0125	0.000000	0.0952	0.677	123.941	26
1	What You Know Bout Love	Pop Smoke	160000	2020-07-03	87	0.709	0.548	10	-8.493	1	0.3530	0.6500	0.000002	0.1330	0.543	83.995	24
2	OUT WEST (feat. Young Thug)	JACKBOYS	157712	2019-12-27	83	0.802	0.591	8	-4.895	1	0.2250	0.0104	0.000000	0.1960	0.309	139.864	23
3	drivers license	Olivia Rodrigo	242013	2021-01-08	94	0.585	0.436	10	-8.761	1	0.0601	0.7210	0.000013	0.1050	0.132	143.874	23
4	No Idea	Don Toliver	154424	2019-05-29	5	0.651	0.631	6	-5.717	0	0.0896	0.5190	0.000579	0.1650	0.350	127.994	23

	artist_name	Count	artist_name	Count
0	Megan Thee Stallion	2	Doja Cat	4
1	Cardi B	2	Olivia Rodrigo	4
2	Doja Cat	2	Bad Bunny	3
3	Pop Smoke	2	Lil Nas X	2
4	24kGoldn	1	BTS	2
5	Ritt Momney	1	The Weeknd	2
6	Lil Vinceyy	1	Dua Lipa	2
7	Mike Posner	1	The Kid LAROI	2
8	Monte Booker	1	Ariana Grande	2
9	Nelly Furtado	1	Måneskin	2

	track_name	artist_name_x
0	Don't Start Now	Dua Lipa
1	drivers license	Olivia Rodrigo
2	Mood (feat. iann dior)	24kGoldn
3	Peaches (feat. Daniel Caesar & Giveon)	Justin Bieber
4	34+35	Ariana Grande
5	Heartbreak Anniversary	Giveon
6	Blinding Lights	DJ Challenge X
7	Kiss Me More (feat. SZA)	Doja Cat

A Musical Analysis on Generation Z (aka TikTok)

Judy Song

Introduction

Data Collection & Management

TikTok Dataset

Spotify Dataset

Data Reorganization

Dataset Info

Data Exploration and Visualization

TikTok Data: Revisualization

To Correlate or Not to Correlate?

Tiktok vs. Spotify

Does Time Matter?

Popularity Contest

Trendsetter Gen Z?

Conclusion