Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update RMP Scraper from Trends #63

Merged
merged 2 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions src/data/builder.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { CourseHeader } from '~content';
import { requestProfessorFromRmp } from '~data/fetchFromRmp';

import { SCHOOL_ID } from './config';
import { SCHOOL_ID, SCHOOL_NAME } from './config';
import { fetchNebulaGrades, fetchNebulaProfessor } from './fetch';

export interface ProfessorProfileInterface {
Expand Down Expand Up @@ -43,9 +43,18 @@ export async function buildProfessorProfile(
);

const rmpsPromise = requestProfessorFromRmp({
professorName: professorSplit.profFirst + ' ' + professorSplit.profLast,
profFirst: professorSplit.profFirst,
profLast: professorSplit.profLast,
schoolId: SCHOOL_ID,
}).then((result) => (rmp = result));
schoolName: SCHOOL_NAME,
})
.then((result) => {
console.log(result);
if (result.message === 'success') {
rmp = result.data;
}
})
.catch((error) => console.error(error.message));

await Promise.all([
nebulaProfessorsPromise,
Expand Down
51 changes: 47 additions & 4 deletions src/data/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,58 @@ export const HEADERS = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
'Content-Type': 'application/json',
Referer: 'https://www.ratemyprofessors.com/',
};

export const PROFESSOR_QUERY = {
query:
'query RatingsListQuery($id: ID!) {node(id: $id) {... on Teacher {legacyId school {id} courseCodes {courseName courseCount} firstName lastName numRatings avgDifficulty avgRating department wouldTakeAgainPercent teacherRatingTags { tagCount tagName } ratingsDistribution { total r1 r2 r3 r4 r5 } }}}',
variables: {},
export const PROFESSOR_SEARCH_QUERY = {
query: `
query TeacherSearchQuery($query: TeacherSearchQuery!) {
newSearch {
teachers(query: $query) {
edges {
node {
id
legacyId
firstName
lastName
school {
id
name
}
department
avgRating
numRatings
avgDifficulty
wouldTakeAgainPercent
teacherRatingTags {
tagName
tagCount
}
ratingsDistribution {
total
r1
r2
r3
r4
r5
}
}
}
}
}
}
`,
variables: {
query: {
text: '',
schoolID: '',
},
},
};

// RateMyProfessors identifier of the school to search; passed as `schoolId` when requesting a professor.
export const SCHOOL_ID = '1273';
// School name; search results whose `school.name` differs from this are filtered out.
export const SCHOOL_NAME = 'The University of Texas at Dallas';

// RateMyProfessors GraphQL endpoint that professor queries are POSTed to.
export const RMP_GRAPHQL_URL = 'https://www.ratemyprofessors.com/graphql';

export const neededOrigins = [
Expand Down
5 changes: 3 additions & 2 deletions src/data/fetch.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fetchWithCache, {
cacheIndexGrades,
cacheIndexProfessor,
expireTime,
} from '~data/fetchWithCache';

interface FetchProfessorParameters {
Expand All @@ -23,7 +24,7 @@ export async function fetchNebulaProfessor(
},
},
cacheIndexProfessor,
2629800000,
expireTime,
)
.then((data) => {
if (data.message !== 'success') {
Expand Down Expand Up @@ -52,7 +53,7 @@ export async function fetchNebulaGrades(
},
},
cacheIndexGrades,
2629800000,
expireTime,
)
.then((data) => {
if (data.message !== 'success') {
Expand Down
207 changes: 61 additions & 146 deletions src/data/fetchFromRmp.ts
Original file line number Diff line number Diff line change
@@ -1,161 +1,76 @@
import { HEADERS, PROFESSOR_QUERY, RMP_GRAPHQL_URL } from '~data/config';
import fetchWithCache, { cacheIndexRmp } from '~data/fetchWithCache';

/**
 * Logs an error to the console, prefixed with the place where it occurred.
 *
 * @param context - Label for where the error happened (callers pass a function name).
 * @param err - The error or message; it is string-concatenated into the log line.
 */
function reportError(context: string, err: unknown): void {
  console.error('Error in ' + context + ': ' + err);
}

/**
 * Builds the RateMyProfessors search-page URL for a professor at a school.
 *
 * @param professorName - Name to search for; added as the `q` query parameter.
 * @param schoolId - School identifier placed in the URL path.
 * @returns The full search URL as a string.
 */
function getProfessorUrl(professorName: string, schoolId: string): string {
  // The trailing '?' is kept so the path/query split happens exactly where the base string ends.
  const searchUrl = new URL(
    `https://www.ratemyprofessors.com/search/professors/${schoolId}?`,
  );
  searchUrl.searchParams.append('q', professorName);
  return searchUrl.href;
}

/**
 * Extracts the legacy id of the best-matching professor from raw response text.
 *
 * The text is scanned for `"legacyId" … "numRatings" … "firstName" … "lastName"`
 * groups. A group is a candidate when `professorName` (case-insensitively)
 * contains "<first word of firstName> <lastName>". Among candidates, the one
 * with the most ratings wins; on a tie the later occurrence wins.
 *
 * @param text - Raw text to scan.
 * @param professorName - Full name of the professor being looked up.
 * @returns The matching `legacyId` digits as a string, or `null` when nothing matches.
 */
function getProfessorId(text: string, professorName: string): string | null {
  const lowerCaseProfessorName = professorName.toLowerCase();
  const regex =
    /"legacyId":(\d+).*?"numRatings":(\d+).*?"firstName":"(.*?)","lastName":"(.*?)"/g;

  let bestId: string | null = null;
  // Must be updated on every accepted candidate; otherwise the comparison
  // below is always true and the last name match wins regardless of ratings.
  let highestNumRatings = 0;

  for (const match of text.matchAll(regex)) {
    const [, legacyId, ratingsText, firstName, lastName] = match;
    // Only the first word of the first name is compared, so middle names
    // or initials in the scraped data do not prevent a match.
    const candidateName =
      firstName.split(' ')[0].toLowerCase() + ' ' + lastName.toLowerCase();
    const numRatings = parseInt(ratingsText, 10);

    if (
      lowerCaseProfessorName.includes(candidateName) &&
      numRatings >= highestNumRatings
    ) {
      bestId = legacyId;
      highestNumRatings = numRatings;
    }
  }

  return bestId;
}

function getGraphQlUrlProp(professorId: string) {
HEADERS['Referer'] =
`https://www.ratemyprofessors.com/ShowRatings.jsp?tid=${professorId}`;
PROFESSOR_QUERY['variables']['id'] = btoa(`Teacher-${professorId}`);
import { HEADERS, PROFESSOR_SEARCH_QUERY, RMP_GRAPHQL_URL } from '~data/config';
import fetchWithCache, {
cacheIndexRmp,
expireTime,
} from '~data/fetchWithCache';

function getGraphQlUrlProp(name: string, schoolID: string) {
PROFESSOR_SEARCH_QUERY.variables.query.text = name;
PROFESSOR_SEARCH_QUERY.variables.query.schoolID = btoa('School-' + schoolID);
return {
method: 'POST',
headers: HEADERS,
body: JSON.stringify(PROFESSOR_QUERY),
body: JSON.stringify(PROFESSOR_SEARCH_QUERY),
};
}

/**
 * Returns a promise that resolves (with no value) after `delay` milliseconds.
 *
 * @param delay - Time to wait, handed straight to `setTimeout`.
 */
function wait(delay: number): Promise<void> {
  return new Promise((resolve) => {
    setTimeout(resolve, delay);
  });
}

/**
 * Fetches `url` through `fetchWithCache`, retrying when the fetch rejects.
 *
 * Each rejection consumes one try. While tries remain, the call waits `delay`
 * milliseconds and tries again; once they run out, the last error is rethrown.
 *
 * @param url - Address to fetch.
 * @param delay - Milliseconds to wait between attempts.
 * @param tries - Total number of attempts allowed. Values of 0 or less behave
 *   like 1: a single attempt is made and its error is propagated.
 * @param fetchOptions - Options forwarded unchanged to `fetchWithCache`.
 * @returns Whatever `fetchWithCache` resolves with.
 */
function fetchRetry(url: string, delay: number, tries: number, fetchOptions) {
  function onError(err: unknown) {
    const triesLeft = tries - 1;
    // `<= 0` instead of a falsy check: with `tries` of 0 or a negative number
    // the counter never lands exactly on 0, which would retry forever.
    if (triesLeft <= 0) {
      throw err;
    }
    return wait(delay).then(() =>
      fetchRetry(url, delay, triesLeft, fetchOptions),
    );
  }
  // 2629800000 is passed in the same position other call sites use for the
  // cache expiry — NOTE(review): looks like ~1 month in milliseconds; confirm.
  return fetchWithCache(url, fetchOptions, cacheIndexRmp, 2629800000).catch(
    onError,
  );
}

/**
 * Passes a response through when it is usable, otherwise logs and re-fetches.
 *
 * A response is returned untouched when its `status` is 200, or when it is
 * missing or has no `url` to retry against. In every other case the failure
 * details are reported and the same URL is fetched again via `fetchRetry`
 * (200 ms delay, 3 tries).
 *
 * @param response - Result of a previous fetch; may be null/undefined.
 * @param fetchOptions - Options to reuse for the retry; `{}` when falsy.
 * @returns The original response, or the result of the retry.
 */
async function validateResponse(response, fetchOptions) {
  const isOk = response?.status === 200;
  if (isOk || !response || !response.url) {
    return response;
  }

  const { status, statusText, redirected, url } = response;
  const details = { status, statusText, redirected, url };
  reportError(
    'validateResponse',
    'Status not OK for fetch request. Details are: ' + JSON.stringify(details),
  );

  // Fall back to an empty options object when none were supplied.
  return await fetchRetry(url, 200, 3, fetchOptions || {});
}

/**
 * Fetches professor data from the RMP GraphQL endpoint and settles the
 * caller's promise through `resolve`.
 *
 * On success the validated response is unwrapped to `data.node` when that
 * path exists, otherwise it is passed on as-is. On any failure, synchronous
 * or asynchronous, the error is reported and `resolve(null)` is called, so the
 * caller's promise always settles.
 *
 * @param graphQlUrlProp - Fetch options (method, headers, body) for the request.
 * @param resolve - Callback that receives the professor data or `null`.
 */
function fetchWithGraphQl(graphQlUrlProp, resolve) {
  const onFailure = (err: unknown) => {
    reportError('fetchWithGraphQl', err);
    resolve(null);
  };

  try {
    fetchWithCache(RMP_GRAPHQL_URL, graphQlUrlProp, cacheIndexRmp, 2629800000)
      .then((response) => validateResponse(response, graphQlUrlProp))
      .then((rating) => {
        if (
          rating != null &&
          Object.hasOwn(rating, 'data') &&
          Object.hasOwn(rating['data'], 'node')
        ) {
          rating = rating['data']['node'];
        }
        resolve(rating);
      })
      // A surrounding try/catch cannot see rejections from the chain; without
      // this handler a failed fetch would leave `resolve` uncalled forever.
      .catch(onFailure);
  } catch (err) {
    // Covers the case where fetchWithCache throws before returning a promise.
    onFailure(err);
  }
}

export interface RmpRequest {
professorName: string;
profFirst: string;
profLast: string;
schoolId: string;
schoolName: string;
}
export function requestProfessorFromRmp(
request: RmpRequest,
): Promise<RMPInterface> {
export function requestProfessorFromRmp({
profFirst,
profLast,
schoolId,
schoolName,
}: RmpRequest): Promise<RMPInterface> {
profFirst = profFirst.split(' ')[0];
const name = profFirst + ' ' + profLast;
// create fetch object for professor
const graphQlUrlProp = getGraphQlUrlProp(name, schoolId);
return new Promise((resolve, reject) => {
// url for promises
const professorUrl = getProfessorUrl(
request.professorName,
request.schoolId,
);

// fetch professor id from url
fetchWithCache(
professorUrl,
{ method: 'GET' },
cacheIndexRmp,
2629800000,
true,
)
.then((text) => {
const professorId = getProfessorId(text, request.professorName);

// create fetch object for professor id
const graphQlUrlProp = getGraphQlUrlProp(professorId);

// fetch professor info by id with graphQL
fetchWithGraphQl(graphQlUrlProp, resolve);
// fetch professor info by name with graphQL
fetchWithCache(RMP_GRAPHQL_URL, graphQlUrlProp, cacheIndexRmp, expireTime)
.then((response) => {
if (
response == null ||
!Object.hasOwn(response, 'data') ||
!Object.hasOwn(response.data, 'newSearch') ||
!Object.hasOwn(response.data.newSearch, 'teachers') ||
!Object.hasOwn(response.data.newSearch.teachers, 'edges')
) {
reject({ message: 'Data for professor not found' });
return;
}
//Remove profs not at UTD and with bad name match
const professors = response.data.newSearch.teachers.edges.filter(
(prof: { node: RMPInterface }) =>
prof.node.school.name === schoolName &&
prof.node.firstName.includes(profFirst) &&
prof.node.lastName.includes(profLast),
);
if (professors.length === 0) {
reject({ message: 'Data for professor not found' });
return;
}
//Pick prof instance with most ratings
let maxRatingsProfessor = professors[0];
for (let i = 1; i < professors.length; i++) {
if (
professors[i].node.numRatings > maxRatingsProfessor.node.numRatings
) {
maxRatingsProfessor = professors[i];
}
}
resolve({
message: 'success',
data: maxRatingsProfessor.node,
});
})
.catch((error) => {
reportError('requestProfessorFromRmp', error);
reject(error);
reject({ message: error.message });
});
});
}
Expand Down
4 changes: 3 additions & 1 deletion src/data/fetchWithCache.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ const storage = new Storage({
//Increment these to reset cache on next deployment
export const cacheIndexProfessor = 0;
export const cacheIndexGrades = 0;
export const cacheIndexRmp = 0;
export const cacheIndexRmp = 1;

// Cache lifetime that callers hand to fetchWithCache: 7 days expressed in seconds (604800).
// NOTE(review): call sites previously passed 2629800000, which looks like a
// millisecond count (~1 month) — confirm fetchWithCache expects seconds here.
export const expireTime = 7 * 24 * 60 * 60; //1 week

function getCache(key: string, cacheIndex: number) {
return new Promise((resolve, reject) => {
Expand Down
Loading