Skip to content

Commit

Permalink
Use LLMs to evaluate movies (#49)
Browse files Browse the repository at this point in the history
* Use LLMs to evaluate movies

* Adding additional keys

* Updating changelog

* Updating packages

* Bug fixes to array filters
  • Loading branch information
sjlu authored Dec 30, 2024
1 parent 38d9289 commit a3ea378
Show file tree
Hide file tree
Showing 11 changed files with 3,019 additions and 3,898 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/upload_s3.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ jobs:
key: static-cache-${{ env.DATE }}
restore-keys: |
static-cache-
- name: Install Node.js 16.x
- name: Install Node.js 20.x
uses: actions/setup-node@v3
with:
node-version: '16.x'
node-version: '20.x'
cache: 'npm'
- run: npm install
- name: test
Expand All @@ -45,6 +45,7 @@ jobs:
CLOUDFLARE_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_ACCESS_KEY_ID }}
CLOUDFLARE_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_SECRET_ACCESS_KEY }}
CLOUDFLARE_BUCKET: ${{ secrets.CLOUDFLARE_BUCKET }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
- name: Store
uses: actions/upload-artifact@v3
with:
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 2024-12-30

* MAJOR: Using Anthropic Claude 3.5 Sonnet for evaluation on main recommendation file. The original file is now `all-movies.json`.
* Swapped out Metacritic in favor of TMDB for the main source of movies.
* Fixed an issue where Metacritic ratings were not being pulled correctly from OMDB.

## 2023-09-11

* Moving to Cloudflare R2 for hosting.
Expand Down
13 changes: 11 additions & 2 deletions bin/deploy.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ const build = function (listBuilder, filename, opts = {}) {
opts
})
.then(function () {
if (this.opts.evaluate === true) {
return this.listBuilder.evaluate()
}

return this.listBuilder.filter(this.opts)
})
.then(function (movies) {
Expand All @@ -40,10 +44,15 @@ Promise
.then(function () {
return [
{
filename: 'movies.json'
filename: 'movies.json',
evaluate: true
},
{
filename: `movies-${moment().format('YYYYMMDD')}.json`,
evaluate: true
},
{
filename: `movies-${moment().format('YYYYMMDD')}.json`
filename: 'all-movies.json'
},
{
filename: 'movies-metacritic-min50.json',
Expand Down
3 changes: 2 additions & 1 deletion config.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ const config = {
CLOUDFLARE_ACCOUNT_ID: '',
CLOUDFLARE_ACCESS_KEY_ID: '',
CLOUDFLARE_SECRET_ACCESS_KEY: '',
CLOUDFLARE_BUCKET: ''
CLOUDFLARE_BUCKET: '',
ANTHROPIC_API_KEY: ''
}

module.exports = _.pick(_.assign({}, config, process.env), _.keys(config))
192 changes: 152 additions & 40 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,38 @@ const Promise = require('bluebird')
const moment = require('moment')
const _ = require('lodash')
const tmdb = require('./lib/tmdb')
const fsCache = require('./lib/fs_cache')
const metacritic = require('./lib/metacritic')
const omdb = require('./lib/omdb')
const imdb = require('./lib/imdb')
const metacritic = require('./lib/metacritic')
const anthropic = require('./lib/anthropic')

const getTmdbDetails = function (movies) {
return Promise
.resolve(movies)
.mapSeries(function (movie) {
return tmdb.searchMovie(movie.title)
.then(function (tmdbMovie) {
tmdbMovie.metacritic_score = movie.score
return tmdbMovie
})
})
}

const getImdbId = function (tmdbId) {
return fsCache.wrap(tmdbId, function () {
return tmdb.getMovie(tmdbId)
.then(function (movie) {
return { imdbId: movie.imdb_id }
})
})
// Looks up a single movie by its TMDB id. Kept as a named seam so callers
// do not talk to the TMDB client directly; the result is whatever
// `tmdb.getMovie` returns for that id.
const getTmdb = (tmdbId) => tmdb.getMovie(tmdbId)

const associateImdbIds = function (movies) {
const getTmdbDetails = function (movies) {
return Promise
.resolve(movies)
.mapSeries(function (movie) {
// we then need to map an imdb_id to each and every movie
return getImdbId(movie.id)
.then(function ({ imdbId }) {
movie.imdb_id = imdbId
movie.tmdb_id = movie.id
return movie
return getTmdb(movie.id)
.then(function (tmdbMovie) {
return _.assign(movie, {
tmdb_id: tmdbMovie.id,
imdb_id: tmdbMovie.imdb_id,
budget: tmdbMovie.budget === 0 ? null : tmdbMovie.budget,
revenue: tmdbMovie.revenue === 0 ? null : tmdbMovie.revenue,
top_actors: _.chain(tmdbMovie.credits.cast)
.take(3)
.map('name')
.value(),
director: tmdbMovie.credits.crew.find(c => c.job === 'Director')?.name,
production_companies: _.map(tmdbMovie.production_companies, 'name'),
genres: _.chain(tmdbMovie.genres)
.map('name')
.map(name => _.snakeCase(name).toLowerCase())
.join(', ')
.value()
})
})
})
}
Expand Down Expand Up @@ -70,15 +66,107 @@ const getOmdbRatings = function (movies) {
})
}

const uniqueMovies = function (movies) {
return _.uniqBy(movies, 'imdb_id')
/**
 * Reduces a movie title to a key that can be compared across data sources.
 *
 * Everything that is not a letter, combining mark, digit or underscore is
 * dropped and the remainder is lower-cased. The match is Unicode-aware: an
 * ASCII-only `\w` would strip every character of a non-Latin title, making
 * all such titles normalize to the same empty key and match each other.
 * NFKC normalization makes composed and decomposed spellings of the same
 * title produce the same key. Pure-ASCII titles normalize exactly as before.
 *
 * @param {string|null|undefined} title - Raw title.
 * @returns {string} The comparison key; `''` when `title` is null/undefined.
 */
const normalizeTitle = function (title) {
  if (title === null || title === undefined) {
    return ''
  }

  return String(title)
    .normalize('NFKC')
    .replace(/[^\p{L}\p{M}\p{N}_]/gu, '')
    .toLowerCase()
}

/**
 * Adds a `metacritic_score` property to every movie by joining the movie
 * list against the Metacritic listing on normalized title.
 *
 * Movies are mutated in place and returned in their original order. A movie
 * with no title match still receives the property, set to `undefined`.
 * When several Metacritic entries share a normalized title, the last one
 * in the listing wins.
 *
 * @param {Array<Object>} movies - Movies carrying at least a `title`.
 * @returns {Promise<Array<Object>>} The same movie objects, annotated.
 */
const getMetacriticRatings = async function (movies) {
  const listing = await metacritic()

  const entriesByTitle = _.keyBy(listing, entry => normalizeTitle(entry.title))
  const scoreByTitle = _.mapValues(entriesByTitle, 'score')

  return movies.map(movie => {
    const score = scoreByTitle[normalizeTitle(movie.title)]
    return _.assign(movie, { metacritic_score: score })
  })
}

/**
 * Asks the LLM to pick and rank the most popular movies from `movies`.
 *
 * A trimmed-down view of every movie is serialized to JSON and sent as the
 * user message; the model is instructed to answer with a JSON array of
 * movie IDs in ranked order. The IDs are then mapped back to the original
 * movie objects.
 *
 * The model's answer is treated as untrusted input:
 * - a non-array answer is rejected outright rather than mapped blindly;
 * - IDs that do not correspond to a supplied movie are dropped (previously
 *   they became `undefined` entries that later turned into empty objects);
 * - repeated IDs are only honoured once, keeping the first (highest) rank;
 * - IDs are compared by their string form, so `"123"` matches `123`.
 *
 * @param {Array<Object>} movies - Candidate movies; each needs a unique `id`.
 * @returns {Promise<Array<Object>>} The chosen movie objects, in the order
 *   the model ranked them.
 * @throws {Error} If the model's answer is not a JSON array.
 */
const evaluateMovies = async function (movies) {
  // NOTE: this prompt is runtime text sent to the model; it is kept verbatim.
  const system = `
You are a movie critic that is given a list of movies released in the last 4 months. Your goal is to suggest and sort order the most popular movies.
You will be given a list of movies with the following details:
- Title
- Production Companies
- Release Date
- Genres
- Budget
- Revenue
- Metacritic Score (0-100)
- Rotten Tomatoes Score (0-100)
- IMDb Rating (0-10)
- IMDb Vote Count
- TMDB Score (0-10)
- TMDB Vote Count
- Top 3 actors in the movie
- Director
- Writer
When considering the popularity of a movie, consider the following:
- The budget of the movie and and how much revenue it made. Don't consider ROI, just consider how large the spend or revenue is.
- The number of votes the movie received and the rating of the movie.
- The production companies of the movie and the quality of the movies they have produced, and how well known the companies are.
- The actors & directors in the movie and how well known they are.
A null value means that the data could not be found or isn't publicly available.
Return the IDs of the most popular movies, in sorted order, in a JSON array, without comments.
Include, at most, 15 movies.
Your response should look similar to:
\`\`\`json
[
123,
456,
789
]
\`\`\`
`

  const moviesData = movies.map(function (movie) {
    return _.pick(movie, [
      'id',
      'title',
      'production_companies',
      'release_date',
      'genres',
      'budget',
      'revenue',
      'metacritic_score',
      'imdb_rating',
      'imdb_votes',
      'rt_score',
      'vote_average',
      'vote_count',
      'top_actors',
      'director',
      // The prompt tells the model a Writer is supplied, so send it when the
      // movie has one (`_.pick` simply omits the key otherwise).
      'writer'
    ])
  })

  const response = await anthropic.prompt(system, JSON.stringify(moviesData))

  if (!Array.isArray(response)) {
    throw new Error('Expected the model to return a JSON array of movie IDs')
  }

  // Index once instead of scanning `movies` for every returned ID. The first
  // movie wins on a duplicated id, matching what a linear `find` would return.
  const moviesById = new Map()
  for (const movie of movies) {
    const key = String(movie.id)
    if (!moviesById.has(key)) {
      moviesById.set(key, movie)
    }
  }

  const alreadyPicked = new Set()
  const suggestedMovies = []
  for (const id of response) {
    const key = String(id)
    const movie = moviesById.get(key)
    if (movie === undefined || alreadyPicked.has(key)) {
      continue
    }
    alreadyPicked.add(key)
    suggestedMovies.push(movie)
  }

  return suggestedMovies
}

/**
 * Strips every movie down to the public fields written to the output files:
 * `title`, `tmdb_id`, `imdb_id` and `poster_url`.
 *
 * The duplicated `return` that preceded the multi-line `_.pick` was
 * unreachable dead code and has been removed; the picked keys are unchanged.
 *
 * `Promise` in this file is Bluebird, so `.map` here is the promise-level
 * map and the result is a promise of the trimmed list.
 *
 * @param {Array<Object>} movies - Fully annotated movie objects.
 * @returns {Promise<Array<Object>>} New objects holding only the public keys.
 */
const sanatizeForResponse = function (movies) {
  return Promise
    .resolve(movies)
    .map(function (movie) {
      return _.pick(movie, [
        'title',
        'tmdb_id',
        'imdb_id',
        'poster_url'
      ])
    })
}

Expand All @@ -98,6 +186,18 @@ const filterByMaxValue = function (key, value = 0) {
}
}

/**
 * Builds a filter that drops every movie whose `key` field contains at least
 * one of the excluded `values`.
 *
 * The field may be a real array or a comma-separated string (`genres` is
 * stored as e.g. `'action, science_fiction'`). Strings are split into
 * trimmed tokens and compared exactly, so excluding `'fiction'` no longer
 * removes `'science_fiction'` by substring accident. A missing or `null`
 * field is treated as "no values" instead of throwing.
 *
 * @param {string} key - Property (lodash path) to inspect on each movie.
 * @param {Array|*} [values] - Values to exclude; a single non-array value is
 *   treated as a one-element list. When null/undefined the filter is a no-op.
 * @returns {function(Array<Object>): Array<Object>} Filter over a movie list.
 */
const rejectArrayValues = function (key, values) {
  return function (movies) {
    if (_.isNil(values)) {
      return movies
    }

    // `[].concat` accepts both an array and a lone value.
    const excluded = new Set([].concat(values))

    return _.reject(movies, function (movie) {
      const raw = _.get(movie, key)

      let entries
      if (_.isNil(raw)) {
        entries = []
      } else if (Array.isArray(raw)) {
        entries = raw
      } else {
        entries = String(raw)
          .split(',')
          .map(token => token.trim())
          .filter(token => token !== '')
      }

      return entries.some(entry => excluded.has(entry))
    })
  }
}

const calculateMovieAge = function (movies) {
return _.map(movies, function (movie) {
movie.age = moment().diff(movie.release_date, 'days')
Expand All @@ -119,7 +219,14 @@ const logger = function (movies) {
'rt_score',
'popularity',
'vote_average',
'vote_count'
'vote_count',
'genres',
'budget',
'revenue',
'production_companies',
'top_actors',
'director',
'writer'
])
}

Expand All @@ -136,15 +243,12 @@ module.exports = (function () {
}

return Promise
.resolve(metacritic())
.then(getTmdbDetails)
.then(filterByMinValue('vote_count', 10))
.then(filterByMinValue('popularity', 30))
.resolve(tmdb.getMovies())
.then(calculateMovieAge)
.then(filterByMinValue('age', 21))
.then(filterByMaxValue('age', 365))
.then(associateImdbIds)
.then(uniqueMovies)
.then(filterByMaxValue('age', 120))
.then(filterByMinValue('age', 0))
.then(getTmdbDetails)
.then(getMetacriticRatings)
.then(getOmdbRatings)
.then(getImdbRatings)
.tap(logger)
Expand All @@ -161,6 +265,14 @@ module.exports = (function () {
.then(filterByMinValue('metacritic_score', opts.min_metacritic_score))
.then(filterByMinValue('rt_score', opts.min_rt_score))
.then(filterByMinValue('imdb_rating', opts.min_imdb_rating))
.then(rejectArrayValues('genres', opts.exclude_genres))
.then(sanatizeForResponse)
}

ListBuilder.prototype.evaluate = function () {
return Promise
.resolve(getMovies())
.then(evaluateMovies)
.then(sanatizeForResponse)
}

Expand Down
36 changes: 36 additions & 0 deletions lib/anthropic.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
const Anthropic = require('@anthropic-ai/sdk')
const config = require('../config')

// Module-wide Anthropic SDK client, created once at load time and
// authenticated with ANTHROPIC_API_KEY from the project config.
const client = new Anthropic({
apiKey: config.ANTHROPIC_API_KEY
})

/**
 * Sends a single-turn prompt to the model and returns the JSON payload found
 * in its reply.
 *
 * The reply is expected to carry the JSON inside a fenced code block. The
 * first fence line may be bare or carry any language tag (```` ```json ````,
 * ```` ```JSON ````, ...). If the reply has no fence at all, the whole text
 * is tried as JSON before giving up, so a bare-JSON answer is accepted too.
 *
 * @param {string} system - System prompt.
 * @param {string} content - User message content.
 * @returns {Promise<*>} The parsed JSON value.
 * @throws {Error} If the reply has no text block, has no code block and is
 *   not itself valid JSON, or has an opening fence without a closing one.
 * @throws {SyntaxError} If the fenced body is not valid JSON.
 */
module.exports.prompt = async function (system, content) {
  const response = await client.messages.create({
    max_tokens: 4096,
    system,
    messages: [{
      role: 'user',
      content
    }],
    temperature: 0.0,
    model: 'claude-3-5-sonnet-latest'
  })

  // Kept from the original: the raw response is logged on every call.
  console.log(response)

  // Do not assume the text lives in the first content block.
  const textBlock = (response.content || []).find(block => block.type === 'text')
  if (!textBlock) {
    throw new Error('Response does not contain a text block')
  }

  const lines = textBlock.text.split('\n').map(line => line.trim())

  const bodyStartIndex = lines.findIndex(line => /^```\w*$/.test(line))
  if (bodyStartIndex === -1) {
    // No fence: accept a reply that is plain JSON, otherwise report the
    // same error as before.
    try {
      return JSON.parse(textBlock.text)
    } catch (err) {
      throw new Error('Response does not contain a code block', { cause: err })
    }
  }

  const bodyEndIndex = lines.indexOf('```', bodyStartIndex + 1)
  if (bodyEndIndex === -1) {
    throw new Error('Response does not contain a closing code block')
  }

  const data = JSON.parse(lines.slice(bodyStartIndex + 1, bodyEndIndex).join('\n').trim())
  return data
}
7 changes: 4 additions & 3 deletions lib/metacritic.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@ const cheerio = require('cheerio')
const Promise = require('bluebird')
const request = Promise.promisify(require('request'))
const _ = require('lodash')
const moment = require('moment')

async function metacriticRequest (opts) {
return await request(_.defaultsDeep(opts, {
url: 'https://www.metacritic.com/browse/movie',
url: 'https://www.metacritic.com/browse/movie/all/all/all-time/new/',
headers: {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
},
qs: {
releaseType: 'in-theaters',
releaseYearMin: 2022,
releaseYearMin: moment().subtract(12, 'months').year(),
page: 1
}
}))
Expand Down Expand Up @@ -54,7 +55,7 @@ module.exports = async function () {
const movies = []
let page = 1
let movieResults = await getMovies(page)
while (movieResults.length > 0 && page < 4) {
while (movieResults.length > 0 && page < 20) {
movies.push(...movieResults)
page++
movieResults = await getMovies(page)
Expand Down
Loading

0 comments on commit a3ea378

Please sign in to comment.