slog.sheaflabs.com

discourse topic and post backup as a user

[2023-06-22] #code #websites


If you’ve ever needed to back up a third-party hosted discourse.org forum that is at risk of censorship or EOL, but you do not have admin or user API access, you may find this useful.

In this particular instance, the forum uses WordPress as the authoritative source of Discourse users and the DiscourseConnect plugin to enable SSO auth between the two.

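The script in simple terms: it logs in to the WordPress members site, follows the DiscourseConnect SSO redirect to obtain a forum session, pages through the list of latest topics, downloads every post of every topic one by one, and prints everything as a single JSON document.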

This script in its current form is not really useful for backing up and restoring a forum; it only preserves the raw content.

<?php

/*
In CLI run:
touch website-cookies.txt ; php script.php > all.json
*/

// Credentials for the WordPress members site; the keys are the form field
// names that get POSTed to wp-login.php.
$membersLogin = [];
$membersLogin['log'] = '[email protected]'; // members wordpress email
$membersLogin['pwd'] = 'some-secure-password'; // members wordpress password

/**
 * Fetch a URL with the shared session cookies and decode the JSON response.
 *
 * Cookies are read from and written back to website-cookies.txt, redirects
 * are followed, and every call pauses for one second afterwards.
 *
 * @param string $_url Absolute URL of the JSON endpoint to request.
 * @return mixed Decoded JSON (objects as associative arrays), or null when
 *               the transfer fails or the body is not valid JSON.
 */
function dc_get($_url) {
	$ch = curl_init($_url);

	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_COOKIEJAR, "website-cookies.txt");
	curl_setopt($ch, CURLOPT_COOKIEFILE, "website-cookies.txt");

	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

	$output = curl_exec($ch);
	// Capture the error text before the handle is closed.
	$error = ($output === false) ? curl_error($ch) : null;
	curl_close($ch);

	// takes some time but avoids brute protections
	sleep(1);

	if ($error !== null) {
		// Diagnostics go through error_log() so they never end up in the
		// JSON that the script prints on stdout.
		error_log("dc_get: request to ".$_url." failed: ".$error);
		return null;
	}

	$decoded = json_decode($output, true);
	if ($decoded === null && json_last_error() !== JSON_ERROR_NONE) {
		// A literal JSON null is a valid payload; only report real decode errors.
		error_log("dc_get: response from ".$_url." is not valid JSON: ".json_last_error_msg());
	}

	return $decoded;
}

// members login
// POST the WordPress credentials to wp-login.php. The cookies that come back
// are stored in website-cookies.txt, which every later request reuses.
$ch = curl_init("https://members.forum.ext/wp-login.php");

curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($membersLogin));

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEJAR, "website-cookies.txt");
curl_setopt($ch, CURLOPT_COOKIEFILE, "website-cookies.txt");
$output = curl_exec($ch);
if ($output === false) {
	// Without a login the crawl below would run unauthenticated and produce
	// an incomplete backup, so stop here instead of continuing silently.
	error_log("members login request failed: ".curl_error($ch));
	exit(1);
}
curl_close($ch);

// community sso
// Request the forum's SSO endpoint with the cookie jar filled above and
// follow the redirects, so the forum's own cookies are saved to the jar too.
$ch = curl_init("https://community.forum.ext/session/sso?return_path=/login");

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEJAR, "website-cookies.txt");
curl_setopt($ch, CURLOPT_COOKIEFILE, "website-cookies.txt");

curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

$output = curl_exec($ch);
if ($output === false) {
	error_log("community sso request failed: ".curl_error($ch));
	exit(1);
}
curl_close($ch);

// latest topics
// Collect every topic by walking latest.json page by page; paging stops as
// soon as a response carries no more_topics_url.
$_topics = array(); // start empty so a forum without topics still yields valid JSON
$_page = 0;
$_done = false;
while(!$_done) {
	$_lt = dc_get("https://community.forum.ext/latest.json?no_definitions=true&page=".$_page);

	// A failed or malformed response is treated as an empty, final page.
	$_list = array();
	if(is_array($_lt) && isset($_lt['topic_list']) && is_array($_lt['topic_list'])) {
		$_list = $_lt['topic_list'];
	} else {
		error_log("latest.json page ".$_page." returned no topic_list; stopping pagination");
	}

	if(isset($_list['topics']) && is_array($_list['topics'])) {
		foreach($_list['topics'] as $_t) {
			// keyed by topic id, so a topic seen on two pages is stored once
			$_topics[$_t['id']] = $_t;
		}
	}

	if(!empty($_list['more_topics_url'])) {
		$_page += 1;
	} else {
		$_done = true;
	}
}

// For each topic, read the list of post ids from the topic JSON and then
// download every post individually.
foreach($_topics as $_k => $_t) {
	$_streams = dc_get("https://community.forum.ext/t/".$_t['id'].".json");

	$_stream = array();
	if(isset($_streams['post_stream']['stream']) && is_array($_streams['post_stream']['stream'])) {
		$_stream = $_streams['post_stream']['stream'];
	} else {
		error_log("topic ".$_t['id']." returned no post stream");
	}

	$_topics[$_k]['post_stream'] = $_stream;
	$_topics[$_k]['posts'] = array();
	foreach($_stream as $_s) {
		$_post = dc_get("https://community.forum.ext/posts/".$_s.".json");
		if(!is_array($_post)) {
			// keep failed downloads out of the backup instead of storing null
			error_log("post ".$_s." of topic ".$_t['id']." could not be fetched; skipped");
			continue;
		}
		$_topics[$_k]['posts'][] = $_post;
	}
}

$_json = json_encode($_topics);
if($_json === false) {
	// echoing false would print nothing and leave a silently empty all.json
	error_log("could not encode topics as JSON: ".json_last_error_msg());
	exit(1);
}
echo $_json;
?>

You can verify and work with the contents of your all.json in a standard way, for example:

<?php

// Load the backup written by the crawl script and stop early if it is
// missing or does not decode to an array of topics.
if(!is_readable('all.json')) {
	error_log("all.json not found or not readable");
	exit(1);
}
$_json = file_get_contents('all.json');
$_topics = json_decode($_json,true);
if(!is_array($_topics)) {
	error_log("all.json does not contain a JSON list of topics");
	exit(1);
}

// Both counters exist up front so the summary is complete even when there
// are no topics or no posts.
$_count = array('t' => 0, 'p' => 0);

foreach($_topics as $_tk => $_t) {
	$_count['t']+=1;
	echo $_t['created_at']." ".$_t['title']."\n";
	if(empty($_t['posts']) || !is_array($_t['posts'])) {
		continue;
	}
	foreach($_t['posts'] as $_pk => $_p ){
		if(!is_array($_p)) {
			// a post that failed to download may have been stored as null
			continue;
		}
		$_created = isset($_p['created_at']) ? $_p['created_at'] : '';
		$_user = isset($_p['username']) ? $_p['username'] : '';
		$_text = strip_tags(isset($_p['cooked']) ? $_p['cooked'] : '');
		// Cut the preview on character boundaries when mbstring is available,
		// so a multi-byte UTF-8 character is never split in half.
		$_excerpt = function_exists('mb_substr') ? mb_substr($_text, 0, 50, 'UTF-8') : substr($_text, 0, 50);
		echo "\t"." ".$_created." "."@".$_user."\t\t\t".trim(preg_replace("/\r|\n/", "", $_excerpt))."\n";
		$_count['p']+=1;
	}
}
print_r($_count);
?>

← back to index