From b9d6aa4639e54a78257ced0bc13e72ae3deea5d6 Mon Sep 17 00:00:00 2001
From: Julian Lam
Date: Mon, 29 Apr 2024 16:16:07 -0400
Subject: [PATCH] feat: slightly better title generation

---
 install/package.json       |  1 +
 src/activitypub/helpers.js | 44 ++++++++++++++++++++++++++++++++++++++++++++
 src/activitypub/notes.js   |  5 +----
 test/activitypub.js        | 42 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/install/package.json b/install/package.json
index 0385337da9..0566ce5bf2 100644
--- a/install/package.json
+++ b/install/package.json
@@ -46,6 +46,7 @@
         "bootswatch": "5.3.3",
         "chalk": "4.1.2",
         "chart.js": "4.4.2",
+        "cheerio": "^1.0.0-rc.12",
         "cli-graph": "3.2.2",
         "clipboard": "2.0.11",
         "colors": "1.4.0",
diff --git a/src/activitypub/helpers.js b/src/activitypub/helpers.js
index 4ab32d5c03..6b7e73c61b 100644
--- a/src/activitypub/helpers.js
+++ b/src/activitypub/helpers.js
@@ -4,7 +4,9 @@ const { generateKeyPairSync } = require('crypto');
 const winston = require('winston');
 const nconf = require('nconf');
 const validator = require('validator');
+const cheerio = require('cheerio');
 
+const meta = require('../meta');
 const posts = require('../posts');
 const categories = require('../categories');
 const request = require('../request');
@@ -252,3 +254,45 @@ Helpers.resolveObjects = async (ids) => {
 	}));
 	return objects.length === 1 ? objects[0] : objects;
 };
+
+/**
+ * Generates a short plain-text title from a post's HTML content.
+ *
+ * Uses the text of the first heading/title/paragraph/span element that has
+ * any, falling back to the first non-empty line of the document text. Only
+ * the first sentence is kept, and the result is shortened (ending in "...")
+ * when it exceeds `meta.config.maximumTitleLength`.
+ *
+ * @param {string} html - HTML or plain text to derive a title from
+ * @returns {string} The generated title; empty when no text is found
+ */
+Helpers.generateTitle = (html) => {
+	// Non-string input is treated as empty rather than handed to the parser
+	const $ = cheerio.load(typeof html === 'string' ? html : '');
+
+	// Prefer the first matching element that actually contains text
+	let title = $('h1, h2, h3, h4, h5, h6, title, p, span')
+		.toArray()
+		.map((el) => $(el).text().trim())
+		.find(Boolean);
+
+	// Otherwise take the first non-empty line of the parsed text (never the raw
+	// markup); the trailing '' keeps empty input from throwing further down
+	if (!title) {
+		title = $.root().text()
+			.split('\n')
+			.map((line) => line.trim())
+			.find(Boolean) || '';
+	}
+
+	// Keep only the first sentence
+	[title] = title.split('. ');
+
+	// Truncate down if too long
+	const { maximumTitleLength } = meta.config;
+	if (title.length > maximumTitleLength) {
+		title = `${title.slice(0, maximumTitleLength - 3)}...`;
+	}
+
+	return title;
+};
diff --git a/src/activitypub/notes.js b/src/activitypub/notes.js
index 327b51e31e..91f01c0bf6 100644
--- a/src/activitypub/notes.js
+++ b/src/activitypub/notes.js
@@ -71,10 +71,7 @@ Notes.assert = async (uid, input, options = { skipChecks: false }) => {
 	} else {
 		// mainPid ok to leave as-is
 		cid = options.cid || -1;
-		title = name || utils.decodeHTMLEntities(utils.stripHTMLTags(content));
-		if (title.length > meta.config.maximumTitleLength) {
-			title = `${title.slice(0, meta.config.maximumTitleLength - 3)}...`;
-		}
+		title = name || activitypub.helpers.generateTitle(utils.decodeHTMLEntities(content));
 	}
 
 	mainPid = utils.isNumber(mainPid) ? parseInt(mainPid, 10) : mainPid;
diff --git a/test/activitypub.js b/test/activitypub.js
index bdc88a4482..c6e60d4b13 100644
--- a/test/activitypub.js
+++ b/test/activitypub.js
@@ -106,6 +106,48 @@ describe('ActivityPub integration', () => {
 				assert.strictEqual(id, uid);
 			});
 		});
+
+		describe('.generateTitle', () => {
+			it('should take the first paragraph element\'s text', () => {
+				const source = '<p>Lorem ipsum dolor sit amet</p><p>consectetur adipiscing elit. Integer tincidunt metus scelerisque, dignissim risus a, fermentum leo. Pellentesque eleifend ullamcorper risus tempus vestibulum. Proin mollis ipsum et magna lobortis, at pretium enim pharetra. Ut vel ex metus. Mauris faucibus lectus et nulla iaculis, et pellentesque elit pellentesque. Aliquam rhoncus nec nulla eu lacinia. Maecenas cursus iaculis ligula, eu pharetra ex suscipit sit amet.</p>';
+				const title = activitypub.helpers.generateTitle(source);
+				assert.strictEqual(title, 'Lorem ipsum dolor sit amet');
+			});
+
+			it('should take the first line\'s text if no matched elements', () => {
+				const source = 'Lorem ipsum dolor sit amet\n\nconsectetur adipiscing elit. Integer tincidunt metus scelerisque, dignissim risus a, fermentum leo. Pellentesque eleifend ullamcorper risus tempus vestibulum. Proin mollis ipsum et magna lobortis, at pretium enim pharetra. Ut vel ex metus. Mauris faucibus lectus et nulla iaculis, et pellentesque elit pellentesque. Aliquam rhoncus nec nulla eu lacinia. Maecenas cursus iaculis ligula, eu pharetra ex suscipit sit amet.';
+				const title = activitypub.helpers.generateTitle(source);
+				assert.strictEqual(title, 'Lorem ipsum dolor sit amet');
+			});
+
+			it('should trim down the title if it is too long per settings', () => {
+				const value = meta.config.maximumTitleLength;
+				meta.config.maximumTitleLength = 10;
+				try {
+					const source = '@@@@@@@@@@@@@@@@@@@@';
+					const title = activitypub.helpers.generateTitle(source);
+					assert.strictEqual(title, '@@@@@@@...');
+				} finally {
+					// Restore the setting even when the assertion fails
+					meta.config.maximumTitleLength = value;
+				}
+			});
+
+			it('should take the first sentence of a matched element/line', () => {
+				const source = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam a ex pellentesque, fringilla lorem non, blandit est. Nulla facilisi. Curabitur cursus neque vel enim semper, id lacinia elit facilisis. Vestibulum turpis orci, efficitur ut semper eu, faucibus eu turpis. Praesent eu odio non libero gravida tempor. Ut porta pellentesque orci. In porta nunc eget tincidunt interdum. Curabitur vel dui nec libero tempus porttitor. Phasellus tincidunt, diam id viverra suscipit, est diam maximus purus, in vestibulum dui ligula vel libero. Sed tempus finibus ante, sit amet consequat magna facilisis eget. Proin ullamcorper, velit sit amet feugiat varius, massa sem aliquam dui, non aliquam augue velit vel est. Phasellus eu sapien in purus feugiat scelerisque congue id velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos.';
+				const title = activitypub.helpers.generateTitle(source);
+				assert.strictEqual(title, 'Lorem ipsum dolor sit amet, consectetur adipiscing elit');
+			});
+
+			it('should not include markup when no matched elements', () => {
+				const title = activitypub.helpers.generateTitle('<div>Lorem ipsum dolor sit amet</div>');
+				assert.strictEqual(title, 'Lorem ipsum dolor sit amet');
+			});
+
+			it('should return an empty string for empty content', () => {
+				assert.strictEqual(activitypub.helpers.generateTitle(''), '');
+			});
+		});
 	});
 
 	describe('ActivityPub screener middleware', () => {