Similar to one of my earlier projects, we will use Jupyter Notebook for this one — it is a great tool for experimenting and working with data.
{
"ts": "2023-01-30T16:36:40Z",
"username": "",
"platform": "linux",
"ms_played": 239538,
"conn_country": "DE",
"ip_addr_decrypted": "",
"user_agent_decrypted": "",
"master_metadata_track_name": "Wonderwall - Remastered",
"master_metadata_album_artist_name": "Oasis",
"master_metadata_album_album_name": "(What's The Story) Morning Glory? (Deluxe Remastered Edition)",
"spotify_track_uri": "spotify:track:7ygpwy2qP3NbrxVkHvUhXY",
"episode_name": null,
"episode_show_name": null,
"spotify_episode_uri": null,
"reason_start": "remote",
"reason_end": "remote",
"shuffle": false,
"skipped": false,
"offline": false,
"offline_timestamp": 0,
"incognito_mode": false
}
This allows you to figure out not only when and on which device you listened to a song, but also whether and when you skipped it.
# Load every streaming-history JSON export into one DataFrame.
# Spotify splits the extended history across several files
# (e.g. endsong_0.json, endsong_1.json, ...), so read and concatenate them all.
path_to_json = 'my_spotify_data/'
frames = [
    pd.read_json(os.path.join(path_to_json, file_name))
    # sorted() makes the concatenation order deterministic across platforms
    for file_name in sorted(os.listdir(path_to_json))
    if file_name.endswith('.json')
]
# ignore_index=True avoids duplicate row labels carried over from each file
df = pd.concat(frames, ignore_index=True)
# --- Clean up the raw play history ---
# Keep only music tracks: podcast episodes have no spotify_track_uri.
df = df[df['spotify_track_uri'].notna()]
# Drop plays of 15 seconds or less — almost certainly skipped immediately.
df = df[df['ms_played'] > 15000]
# 'ts' is an ISO-8601 UTC string (e.g. "2023-01-30T16:36:40Z").
# Parse explicitly as timezone-aware UTC so later comparisons are unambiguous.
df['ts'] = pd.to_datetime(df['ts'], utc=True)
# Plain calendar date (datetime.date), used by the date-range filter below.
df['date'] = df['ts'].dt.date
# Keep only the columns the later analyses actually use.
columns_to_keep = [
    'ts',
    'date',
    'ms_played',
    'platform',
    'conn_country',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri'
]
df = df[columns_to_keep]
df = df.sort_values(by=['ts'])
# Snapshot of the cleaned data; later cells re-derive views from this copy.
songs_df = df.copy()
# Overall top-20 most-played songs across the entire listening history.
df = songs_df.copy()
# One row per listen, so the group size is the play count per track URI.
df = df.groupby('spotify_track_uri').size().reset_index(name='count')
df = df.sort_values(by='count', ascending=False)
# Attach track/artist/album metadata; de-duplicate songs_df first so each
# URI contributes exactly one metadata row, and merge on the URI explicitly.
df = df.merge(songs_df.drop_duplicates(subset='spotify_track_uri'),
              on='spotify_track_uri')
df = df[['master_metadata_track_name', 'master_metadata_album_artist_name',
         'master_metadata_album_album_name', 'count']]
df.head(20)
def top_songs_in_year(year):
    """Print total listening time for *year* and return its top-20 songs.

    Parameters
    ----------
    year : int
        Calendar year (derived from the 'ts' timestamp) to filter on.

    Returns
    -------
    pandas.DataFrame
        Top 20 tracks of that year with artist/album metadata and play count.
    """
    df = songs_df.copy()
    df['year'] = df['ts'].dt.year
    df = df.loc[df['year'] == year]
    # Sum of played milliseconds, rendered human-readably via timedelta.
    total = datetime.timedelta(milliseconds=int(df['ms_played'].sum()))
    print(f"Time listened in {year}: {total}")
    # One row per listen, so the group size is the play count per track URI.
    df = df.groupby('spotify_track_uri').size().reset_index(name='count')
    df = df.sort_values(by='count', ascending=False)
    # De-duplicate songs_df so each URI contributes one metadata row.
    df = df.merge(songs_df.drop_duplicates(subset='spotify_track_uri'),
                  on='spotify_track_uri')
    df = df[['master_metadata_track_name',
             'master_metadata_album_artist_name',
             'master_metadata_album_album_name',
             'count']]
    return df.head(20)
@interact
def top_songs(date_range=date_range_slider):
    """Interactive top-20 songs for the selected date range.

    *date_range* comes from the ipywidgets slider as a (start, end) pair;
    values are normalised to plain dates to match the 'date' column.
    """
    df = songs_df.copy()
    start = pd.Timestamp(date_range[0]).date()
    end = pd.Timestamp(date_range[1]).date()
    # Inclusive filter on both endpoints of the selected range.
    df = df.loc[(df['date'] >= start) & (df['date'] <= end)]
    # One row per listen, so the group size is the play count per track URI.
    df = df.groupby('spotify_track_uri').size().reset_index(name='count')
    df = df.sort_values(by='count', ascending=False)
    # De-duplicate songs_df so each URI contributes one metadata row.
    df = df.merge(songs_df.drop_duplicates(subset='spotify_track_uri'),
                  on='spotify_track_uri')
    df = df[['master_metadata_track_name',
             'master_metadata_album_artist_name',
             'master_metadata_album_album_name',
             'count']]
    return df.head(20)
def plot_weekday_distribution():
    """Plot, per year, how many tracks were played on each weekday."""
    counts = songs_df.copy()
    counts['year'] = counts['ts'].dt.year
    counts['weekday'] = counts['ts'].dt.weekday  # Monday=0 ... Sunday=6
    counts = counts.groupby(['year', 'weekday']).size().reset_index(name='count')

    fig, ax = plt.subplots(figsize=(12, 8))
    # One line per year, labelled for the legend.
    for yr, yearly in counts.groupby('year'):
        ax.plot(yearly['weekday'], yearly['count'], label=str(yr))

    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
    plt.xticks(range(7), day_names)
    plt.title('Weekday Distribution of Played Tracks Over Years')
    plt.xlabel('Weekday')
    plt.ylabel('Number of Played Tracks')
    plt.legend(title='Year')
    plt.show()