From 717e545e349faae7ef9c2b3f89969166306d7deb Mon Sep 17 00:00:00 2001 From: Pranshu Sharma Date: Wed, 14 May 2025 22:42:33 +1000 Subject: [PATCH] initial commit --- README.md | 55 ++++ genorg | 771 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 826 insertions(+) create mode 100644 README.md create mode 100755 genorg diff --git a/README.md b/README.md new file mode 100644 index 0000000..8bd909e --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ + +# Dependencies + +Gnome libxml2 + +# Usage + +## Templates + +This uses Template::Toolkit for template syntax. + +You must not have any value in `[no_name_dir]` in blog_dir. + + +# Concepts + +Originally the genorg that was made had the concept that moving files +was natural and should not affect the system. + +## Static file tree + +Contrary to the original genorg, this is designed for the blog org +tree to remain static. The original reason I wanted this was so that +the blog directory tree would be easy to try. But then I realised, +it's supposed to be write-only. For a linear blog, you don't really +make modifications to an article once it's published, and you should be +able to find new articles. + + +## Caching + +This uses a 2-step caching process. + +``` + Input Cache Output + +-----------+ +---------------+ +-----------+ + | Org files | ----> | direct org | ----> | Modified | + | | ----> | html export | ----> | html | + +-----------+ +---------------+ +-----------+ + | | + +---------------+ +-----------+ + | Necessary info| ----> | navigation| + | eg title, date| ----> | files | + +---------------+ +-----------+ +``` + +The intermediate transformation of the direct org to html is stored +for caching. The necessary info like path and all is stored in a separate +xml document so that the whole file does not have to be parsed again +to link. + +## Speed for space + +`genorg` copies the whole file tree across to cache in the intermediate +stage; this allows for the `-m` option. 
diff --git a/genorg b/genorg new file mode 100755 index 0000000..1ba8940 --- /dev/null +++ b/genorg @@ -0,0 +1,771 @@ +#!/usr/bin/perl + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# . + +# Copyright (C) 2025 Pranshu Sharma + +=head1 NAME + +genorg - generate a blog from org files + +Author: Pranshu Sharma + +=head1 SYNOPSIS + + --verbose,-v Be verbose. False by default + --config,-c Specify config file. genconf by default + --mid-cache,-m Reuse the intermediate cache + --disable-cache,-d Do not use the cache + --help,-h Print this help + +=head1 VERSION + +0.01 + +=cut + +# Add caching, fun + + +use v5.40.1; + +use warnings; +use strict; + +use feature qw(defer class); +no warnings qw( experimental::class experimental::defer ); + +use Symbol 'gensym'; +use Getopt::Long qw(:config auto_help); +use Path::Tiny qw(path cwd); +use Time::Piece; +use List::Util qw(any pairmap first); +use IPC::Open3; + +use Template; +# We need this for writing the file to comunicate with server +use JSON::XS; + +my @files; +our $Conf; +our $tt; + +GetOptions( + 'config|c=s' => \ (my $config_file = "genorg.toml"), + 'disable-cache|d' => \ our $use_cache, + 'mid-cache|m' => \(our $mid_cache), + 'verbose|v' => \ our $verbose + ); + +$Conf = Config->new(file => $config_file); +$mid_cache = !$mid_cache; +$use_cache = !$use_cache; +if (!$use_cache) { + $mid_cache = 0; +} + +my @cached_arts; + +sub process_file ($path, $state) { + if (-d 
$path) { + # We need to copy the *exact* file structure to output + mkdir ($Conf->out_dir . "/" . $path->relative($Conf->blog_dir)); + return; + } + # For now, we just need the org files. As when we export the org + # files, that might generate new files. + $path =~ /\.org$/ or return; + + if ($use_cache) { + for (@cached_arts) { + return if (path($_->file_path) eq $path->realpath) + } + } + + push @files, $path; +} + +# TODO fill this funciton +sub read_cache { + if (-d $Conf->cache_dir) { + } else { + # There is no cache + $use_cache = 0; + } +} + +sub get_cache_new_filenames ($n) { + return (1..$n) if !@cached_arts || !$use_cache; + my $bcwd = cwd->stringify; + chdir $Conf->cache_dir; + + my @occupied = map + { path($_->mid_file)->relative($Conf->cache_dir) } + @cached_arts; + + my @free; + my $cn = 1; + while ((0+@free) != $n) { + unless (any { $_ eq "$cn" } @occupied) { + push @free, $cn; + } + ++$cn; + } + + chdir $bcwd; + $|++; + return @free; +} + +# We process the $file, and put it in $Conf->template_main +sub process_template ($file, $data) { + $tt->process($file, $data, \(my $stuff)); + $stuff +} + +sub run_emacs (@org_files) { + my @output_files = + map { $Conf->cache_dir . "/$_" } + get_cache_new_filenames(0+@files); + + sub listify (@list) { + '(' . (join "", map "\"$_\" ", @list) . 
')' + } + my $ox = $Conf->ox; + my $in = listify @org_files; + my $out = listify @output_files; + mkdir $Conf->cache_dir; + # Fix package-dir + my ($emacs, $package_dir, $init_file) = + ($Conf->emacs, $Conf->package_dir , $Conf->init_file); + # I keep the postable as it is needed to get data and author and all that + my $init_code = <<"ELISP" =~ s/\n//gr; +(require 'org) + +(org-link-set-parameters "g" + :follow nil + :export + #'(lambda (link desc _ _) + (let ((q (char-to-string 34))) + (concat "" desc ""))) + :store nil) + +(setq org-html-head "" + org-html-head-extra "" + org-html-head-include-default-style nil + org-html-head-include-scripts nil + org-html-preamble nil + org-html-use-infojs nil) +(setq make-backup-files nil) + +(setq in-files '$in) +(setq out-files '$out) +(setq package-dir "$package_dir") +(setq dd default-directory) +(dolist (dir (directory-files package-dir t)) + (add-to-list 'load-path dir)) +(dolist (in in-files) + (find-file dd) + (setq out (pop out-files)) + (find-file in) + (setq string (org-export-as '$ox)) + (find-file out) + (delete-region (point-min) (point-max)) + (insert string) + (save-buffer)) +(princ "DONE" #'external-debugging-output) +ELISP + open3(undef, undef, my $emacs_err = 'gensym', + $Conf->emacs, + ("--load", $init_file) x !!$init_file, + qw(-Q --batch --eval), "(progn $init_code)") + or die "Couldn't open emacs\n"; + + my $done; + while (<$emacs_err>) { + do { $done = true ; last} if $_ eq "DONE"; + print "E-> ", $_; + } + if (!$done) { + die "Couldn't complete emacs task\n"; + } + return @output_files; +} + +our %article; +our $cache_dom; + +sub main { + $|++ if $verbose; + # read the config ----------------------------- + my $conf_dir = path($config_file)->dirname; + chdir $conf_dir + or die "Not valid config path\n"; + $Conf->saturate; + mkdir $Conf->out_dir; + my $cache_file = $Conf->cache_dir . 
"/cache.xml"; + $use_cache = 0 unless -f $cache_file; + if ($use_cache) { + -f $cache_file or $use_cache = 0; + $cache_dom = XML::LibXML->load_xml(location => $cache_file) + or die "Couldn't parse cache file"; + # We need to read the cache over here, and see if any files are no + # use. + for my $cat_node ($cache_dom->findnodes('/data/cat')) { + my $cat = $cat_node->{name}; + for my ($xart) ($cat_node->getChildrenByTagName("art")) { + my $id = $xart->{id}; + + local *value = sub ($n) { + $xart->getChildrenByTagName($n)->[0]->to_literal + }; + + my $mid_file = $Conf->cache_dir . "/" . value("mid_file"); + my $file_path = $Conf->blog_dir . "/" . value("file_path"); + if ((!-f $file_path) + || ((stat($file_path))[9] >= (stat($mid_file))[9]) + ) { + say "Changes made to $cat->$id" if $verbose; + # This means we pretend like this is normal file + } else { + # We just saturate cache + my $art = Article->new(file => $mid_file, path => $file_path); + $art->pinit( + id => $id, + cat => $cat, + date => value("date"), + title => value("title") + ); + $article{$cat}{$id} = $art; + push @cached_arts, $art; + } + } + } + } + + # Read blog dir and get relevent files --------- + # saturate @files with org files to export + path($Conf->blog_dir)->visit(\&process_file, {recurse => 1}); + die "No changed made\n" unless !$mid_cache || @files; + + ($use_cache && !$mid_cache) && @files && die("New files to convert, canot use -m\n"); + + # Run emacs and covert @files to @mid_files ---- + say "Runing emacs" if $verbose; + @files and my @mid_files = run_emacs(@files); + + + if (!$mid_cache) { + push @files, (map { $_->file_path } @cached_arts); + } + + # Now we parse and get information about these files + my %export_files; + @export_files{@files} = @mid_files; + for my ($path, $html) (%export_files) { + # This means this file is already cached. 
+ my $art = + (first { $_->file_path eq $path } @cached_arts) + // + Article->new(file => $html, path => $path); + try { + $verbose && + say("Parsing file " . path($path)->relative($Conf->blog_dir)); + $art->parse; + } catch ($e) { + print "Error when parsing: $e" + } + $article{$art->cat}{$art->id} = $art; + } + + # Now we copy the files from tree to out dir + chdir $Conf->blog_dir; + mkdir $Conf->out_dir; + + # We make a list of files to round up and move for later + my %files_to_move; + path('.')->visit( + sub ($path, $state) { + return if /\.org$/; + if (-d $path) { + $files_to_move{$path} = undef; + } + else { + $files_to_move{$path->realpath} = $path->stringify; + } + } + , {recurse => 1}); + chdir $Conf->out_dir; + + # Setting up Template::Toolkit stuff + + $tt = Template->new({ + INCLUDE_PATH => $Conf->template_dir, + INTERPOLATE => 1, + }) or die "$Template::ERROR\n"; + + my @temp; + my %data = ( + cats => {map { $_ => {name => $_, + link => "/" . $Conf->oblog_dir . "/$_", + rss => "/" . $Conf->orss_dir . "/$_"}} + (keys %article)}, + cat => {pairmap + { + $a => + [sort { $b->{numdate} <=> $a->{numdate} } + (pairmap { + unshift @temp, $b->info_hash; + $temp[0] + } %{$b})] + } + %article}, + all_arts => [sort {$b->{numdate} <=> $a->{numdate}} @temp], + main_dir => "/" . $Conf->oblog_dir, + main_rss => "/" . $Conf->orss_dir + ); + chdir $conf_dir; + + # Ok, now we need to copy the mid html files + for my $art (map {values %{$_}} (values %article)) { + # if $mid_cache is off, we redo files that are already done + if ($mid_cache) { + next if any {$_ == $art} @cached_arts; + } + + $tt->process( + $Conf->template_art, + {%data, + current_cat => $art->cat, + current_id => $art->id, + body => $art->text(), + $art->info_hash->%*}, + \(my $content)); + + path($Conf->out_dir . "/" . 
+ path($art->new_path)->relative($Conf->blog_dir))->spew_utf8($content); + } + + # I need to make 3 special files: + # - article: body + # - cat: Current category + # - layout: content + + say "Processing main blog template" if $verbose; + my $blog = process_template ($Conf->template_blog, \%data); + + chdir $Conf->out_dir; + for ($Conf->otemplate_dir) { + mkdir $_; chdir $_; + } + + path($Conf->template_blog)->spew($blog); + my $rss_fh = path("main.xml")->openw; + defer { close $rss_fh } + gen_rss($rss_fh, [map {values %{$_}} (values %article)]); + # Let's go back to main outdir + + # Now we need to make an INFO json file for genserve + + my $land_cont = + process_template($Conf->template_land, {%data}); + + path($Conf->template_land)->spew($land_cont); + + my %info = ( + blog_file => $Conf->otemplate_dir . '/' . $Conf->template_blog, + landing => $Conf->otemplate_dir . '/' . $Conf->template_land, + files => [], + cpath => [], + blog_dir => "/" . $Conf->oblog_dir . "/", + rss_dir => "/" . $Conf->orss_dir . "/:cat" , + main_rss => $Conf->otemplate_dir . '/main.xml', + main_rss_url => "/" . $Conf->orss_dir + ); + + my $cfile = 1; + for my ($cat, $arts) (%article) { + for my ($id, $art) (%{$arts}) { + push (@{$info{files}} , + {cat => $cat, + art => $art->id, + file => path($art->new_path)->relative($Conf->blog_dir)->stringify}); + } + # We also need to make the template + my $cont = + process_template($Conf->template_cat, {%data, current_cat => $cat}); + my $p = $cfile++ . ".html"; + + path($p)->spew($cont); + my $art_path = path($p)->realpath->relative($Conf->out_dir)->stringify; + push (@{$info{cpath}}, + {cat => $cat, + file => $art_path, + rss => "$art_path.xml"}); + my $fh = path("$p.xml")->openw; + defer { close $fh } + gen_rss($fh, [values %{$arts}], $cat); + } + + chdir $Conf->out_dir; + path($Conf->info_file)->spew(encode_json(\%info)); + + + # Remember we rounded up %files_to_move? 
+ # Now we move them + for my ($from, $to) (%files_to_move) { + if ($to) { + if ($from =~ /\.html\.tt$/) { + # File is a template + $verbose && + say("Processing template file ", + path($from)->relative($Conf->blog_dir)) + ; + my $cont = path($from)->slurp; + $tt->process(\$cont, \%data, \(my $stuff)); + path($to =~ s/\.tt$//r)->spew( $stuff); + } else { + path($from)->copy($to); + } + } else { + mkdir $from; + next; + } + } + + # Now we saturate the cache + my $dom = XML::LibXML::Document->new('1.0', 'UTF-8'); + my $xmain = $dom->createElement('data'); + for my ($cat, $arts) (%article) { + my $xcat = $dom->createElement('cat'); + $xcat->{name} = $cat; + for my ($id, $art) ($arts->%*) { + my $xart = $dom->createElement('art'); + $xart->{id} = $id; + + my $xadd = sub ($name, $value) { + my $xel = $dom->createElement($name); + $xel->appendText($value); + $xart->appendChild($xel); + }; + # ---- + &$xadd('date', $art->date->epoch); + &$xadd('title', $art->title); + &$xadd('mid_file', path($art->mid_file)->relative($Conf->cache_dir)); + &$xadd('file_path', path($art->file_path)->relative($Conf->blog_dir)); + # ---- + $xcat->appendChild($xart); + } + $xmain->appendChild($xcat); + } + $dom->setDocumentElement($xmain); + + path($Conf->cache_dir . "/cache.xml" )->spew($dom->toString(1)); + + say "All done" if $verbose; + +} + +main; + +sub gen_rss ($fh, $arts, $cat //= 0) { + my $title = $Conf->title; + my $desc = $Conf->desc; + my $domain = $Conf->domain; + print $fh <<"EOX" =~ s/\n//gr; + + + +EOX + my $mid_cont; + if ($cat) { + $mid_cont = <<"EOX" + $title: $cat + $domain + $desc. Category: $cat +EOX + } else { + $mid_cont = <<"EOX" + $title + $domain + $desc +EOX + } + print $fh ($mid_cont =~ s/\n//gr); + + for my $art (@{$arts}) { + my ($cat, $id) = ($art->cat, $art->id); + my $link = "/" . $Conf->oblog_dir . 
"/$cat/$id"; + my $art_title = $art->title; + my $time = $art->date->strftime($Conf->time_format); + print $fh <<"EOX" =~ s/\n[ \t]+//gr; + + $art_title + $domain$link + $time + +EOX + } + print $fh ""; +} + +class Article { + use List::Util qw(first); + use Path::Tiny; + use Time::Piece; + use XML::LibXML; + + field $mid_file :param(file) :reader(mid_file); + field $file_path :reader :param(path); + field $new_path :reader; + # The full thing is wayy too hrd to spell + field $cat :reader; + field $id :reader; + field $title :reader; + field $date :reader; + field $dom; + field %related; + + ADJUST { + $new_path = "$file_path.html"; + } + + method text { + $dom or $dom = XML::LibXML->load_html(location => $mid_file, + recover => 1, + suppress_errors => 1); + my ($body) = $dom->findnodes('/html/body'); + $body->toString + } + + method info_hash { + # Get all the relevent info in hash, for templating use. + return { + # TODO related articles + # TODO custom date format + date => $date->strftime($Conf->time_format), + numdate => $date->epoch, + title => $title, + link => "/" . $Conf->oblog_dir . 
"/$cat/$id", + id => $id, + cat => $cat + } + } + + method pinit (%i) { + # id cat date title + $date = localtime($i{date}); + $cat = $i{cat}; + $title = $i{title}; + $id = $i{id}; + } + + sub parse_desc ($str) { + $str || die "No description content provided\n"; + my @return = (); + while ($str =~ s/(^.*?\s*->\s*\w+)\s*//) { + @return = (@return, split(/->\s*/, $1)); + } + @return; + } + + method parse { + $dom = XML::LibXML->load_html(location => $mid_file, + recover => 1, + suppress_errors => 1) + or die "Couldn't parse\n"; + + my ($title_node) = $dom->findnodes("/html/head/title") or die "No title\n"; + $title = $title_node->to_literal; + + my $date_node = + first {$_->to_literal =~ /^Date: /} + ($dom->findnodes('/html/body/*[@id="postamble"]/p[@class="date"]')); + + $date_node or die "No date\n"; + $date = Time::Piece->strptime($date_node->to_literal + =~ s/^Date: ([^ ]+).+$/$1/r, + "%F") or die "Couldn't parse date\n"; + + # We don't need the postamble at all. + $_->parentNode->removeChild($_) for ($date_node->parentNode); + + my ($desc_node) = $dom->findnodes('/html/head/meta[@name="description"]') + or die "Description doesn't exist\n"; + ($cat, $id, %related) = parse_desc $desc_node->{content}; + + # Now, we just need to make the image path an full path for each + # thing. + for my $node ($dom->findnodes("//img")) { + my $src = $node->{src}; + next if $src =~ /^http/; + $src =~ s|file://||; + if ($src =~ m[^/]) { + $src = "/" . path($src)->relative($Conf->blog_dir) + } else { + $src = "/" . + path(path($file_path)->dirname + . 
("/$src" =~ s#\./##r))->relative($Conf->blog_dir); + } + $node->{src} = $src; + } + } + +} + +class Config { + use TOML qw(from_toml); + use List::Util qw(any pairmap pairgrep); + use Path::Tiny qw(cwd path); + + field $file :param :reader; + + field $time_format :reader = "%Y-%m-%d"; + + field $oblog_dir :reader; + field $otemplate_dir :reader; + field $orss_dir :reader; + + field %no_names = ( + blog => [\$oblog_dir, 'blog', 'dir_bn'], + templates => [\$otemplate_dir, 'templates', 'dir_bn'], + rss => [\$orss_dir, 'rss', 'dir_bn'] + ); + + field $out_dir :reader; + field $blog_dir :reader; + field $cache_dir :reader; + field $info_file :reader; + + field %paths = ( + out_dir => [\$out_dir, 0,'dir'], + blog_dir => [\$blog_dir, 0,'existing_dir'], + cache_dir => [\$cache_dir, '.genrog-cache', 'dir'], + info_file => [\$info_file, '.genorg.json', 'file_bn'] + ); + + field $emacs :reader; + field $init_file :reader; + # TODO make this an array + field $package_dir :reader; + field $ox :reader; + + field %emacs_conf = ( + emacs => [\$emacs, qw(emacs program)], + init_file => [\$init_file, 0, 'existing_file'], + package_dir => [\$package_dir, $ENV{HOME} . 
'/.emacs.d/elpa', 'existing_dir'], + ox => [\$ox, "html", "symbol"] + ); + + field $template_dir :reader; + # These 2 are special templates + field $template_art :reader; + field $template_cat :reader; + field $template_main :reader; + field $template_blog :reader; + field $template_land :reader; + + field %template_files = ( + template_dir => [\$template_dir, 'templates', 'existing_dir'], + template_main => [\$template_main, 'layout.html.tt', 'existing_file_bn'], + template_blog => [\$template_blog, 'blog.html.tt', 'existing_file_bn'], + template_art => [\$template_art, 'art.html.tt', 'existing_file_bn'], + template_cat => [\$template_cat, 'cat.html.tt', 'existing_file_bn'], + template_land => [\$template_land, 'index.html.tt', 'existing_file_bn'], + ); + + field $title :reader; + field $domain :reader; + field $desc :reader; + + field %info = ( + title => [\$title, 0, 'symbol'], + domain => [\$domain, 0, 'symbol'], + desc => [\$desc, 0, 'symbol'] + ); + + # Saturate the path variables + # TODO add tilda support + use Data::Dumper; + sub get_paths ($data, $info, $section) { + my %vars = (map {($_, 0)} keys %$info); + for my ($var, $val) (%$data) { + exists $info->{$var} + or die " Unknown directive, '$var'\n"; + + my ($ref, $default, $type) = $info->{$var}->@*; + + validate_type($val, $type); + $val = change_if_fs($val, $type); + + $$ref = $val; + $vars{$var}++; + } + my %not_done = pairgrep { !$b } %vars; + for (keys %not_done) { + my ($ref, $default, $type) = @{$info->{$_}}; + validate_type($default, $type); + $default = change_if_fs($default, $type); + $$ref = $default || die "$_ expected in [$section]\n"; + } + } + + sub change_if_fs ($v, $t) { + if ($t =~ /dir$/ or $t =~ /file$/) { + return path($v)->realpath->stringify + } + return $v + } + + sub validate($v, $t) { + return if validate_type($v, $t); + (die "$v must be " . { + 'existing_file' => "an existing file", + 'existing_dir' => "an existing directory" + }->{$t} . 
"\n") + } + + sub validate_type ($value, $type) { + for ($type) { + /^existing_dir$/ && do { return -d $value }; + /^dir(_bn)?$/ && do { return true }; + /^program$/ && do { return true }; + /^existing_file(_bn)?$/ && do { return -f $value }; + # Symbol + return 1; + } + } + + method saturate { + say "Reading config" if $verbose; + + try { + my $data = from_toml(path($file)->slurp); + # TODO refactor + get_paths(($data->{paths} // die "[paths] needed\n") ,\%paths, "paths"); + get_paths(($data->{emacs} // die "[emacs] needed\n") ,\%emacs_conf, "emacs"); + get_paths(($data->{templates} // die "[templates] needed\n") ,\%template_files, "templates"); + get_paths(($data->{info} // die "[info] needed\n") ,\%info, "info"); + get_paths(($data->{no_names} // die "[no_names] needed\n") ,\%no_names, "no_names") + + } catch ($e) { + die "Error when reading config: $e"; + } + + say "Done reading config" if $verbose; + } +} + +