-
Scrapboxと個人ブログ(Hugo)を同期させるようにした.
- scrapboxとクローラでも言及したが, 空のリンクに検索がヒットするのはよくないと思い, 同期を始めた.
- scrapbox自体は書き心地やUXが最高で手放したくないため, 一部ページを同期させ, 正しく検索結果が載るか試してみる.
-
コードは以下に示す通り.
- scrapbox記法からmarkdownへの変換はこちらを改変したものを使用.
    - javascriptの replace は割と万能である.
- shell script / python / Node.js が動く環境があればOK.
変換スクリプト (gist埋め込みより転載):
# Convert a Scrapbox page dump into a Hugo markdown post.
#
# Usage: scrapbox.py <file> <unix-timestamp>
#   <file>            Scrapbox page text; rewritten in place as Hugo markdown.
#   <unix-timestamp>  becomes the post date (JST "+09:00" offset hard-coded).
#
# NOTE(review): this listing was recovered from a garbled HTML extraction.
# Every literal tagged "reconstructed" below was truncated in the source and
# has been re-derived from the surrounding replace() calls — verify against
# the original gist before trusting the exact syntax conversions.
import datetime
import os
import re
import sys
from urllib.parse import quote

import requests

# "[*# Heading]" -> "### Heading"          (reconstructed — TODO confirm)
head_pattern = re.compile(r"\[\*# ([^\]]+)\]")
# "[$ formula]" inline math                (reconstructed — TODO confirm)
math_pattern = re.compile(r"\[\$ ([^\]]+)\]")
# "[Page Name]" internal Scrapbox link     (reconstructed — TODO confirm)
link_pattern = re.compile(r"\[([^\]]+)\]")
# "[Title https://...]" external link      (reconstructed — TODO confirm)
url_link_pattern = re.compile(r"\[([^\]]+?) (https?://[^\s\]]+)\]")
# "#tag" hashtags
tag_pattern = re.compile(r'#([^\s]+)')
# "[https://...]" bare image URL           (reconstructed — TODO confirm)
img_pattern = re.compile(r"\[(https?://[^\s\]]+)\]")
# "- $...$" list item holding display math (backslashes restored: the
# extracted source read "($[^$]+$)", which has impossible mid-string anchors)
large_math_pattern = re.compile(r'\s*-\s+(\$[^$]+\$)\s*$')

if len(sys.argv) < 3:
    print('Usage: scrapbox.py <file> <date>')
    sys.exit(1)
print(sys.argv)

imgs = []            # image URLs found; imgs[0] becomes the post's top image
tags = []            # hashtags harvested from the page's second line
in_snippets = False  # True while inside a ``` code fence

with open(sys.argv[1], 'r') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    # Skip (near-)empty lines.
    if len(line.replace(" ", "")) <= 1:
        continue
    # Leave fenced code blocks untouched.
    if line.startswith("```"):
        in_snippets = not in_snippets
    if in_snippets:
        continue

    # Scrapbox heading wrapping a link, e.g. "[*** [Page]]".
    # NOTE(review): the replacement string was lost in extraction;
    # '#' * (6 - j) is a guess at the intended markdown heading level.
    for j in range(6):
        stars = "*" * (6 - j)          # renamed from `hash` (shadowed builtin)
        token = f"[{stars} ["
        had_token = token in lines[i]
        lines[i] = lines[i].replace(token, "#" * (6 - j) + " [")
        if had_token:
            # Drop the trailing "]]\n" left behind.  (reconstructed)
            lines[i] = lines[i][:-3] + ')\n'
    line = lines[i]

    # "[*# X]" -> "### X"
    for link in {m.group(1) for m in head_pattern.finditer(line)}:
        lines[i] = lines[i].replace(f"[*# {link}]", f"### {link}")

    # Inline math: "[$ f]" -> "$f$", escaping "_" for Hugo/KaTeX.
    # NOTE(review): replace target/result reconstructed — TODO confirm.
    for link in {m.group(1) for m in math_pattern.finditer(line)}:
        escaped = link.replace("_", r"\_")
        lines[i] = lines[i].replace(f"[$ {link}]", f"${escaped}$")

    # External links: "[Title URL]" -> "[Title](URL)".
    for title, url in {(m.group(1), m.group(2))
                       for m in url_link_pattern.finditer(line)}:
        lines[i] = lines[i].replace(f"[{title} {url}]", f"[{title}]({url})")

    # Internal links: point at a local page when one exists, else Scrapbox.
    for link in {m.group(1) for m in link_pattern.finditer(line)}:
        md_path = (link.replace(' ', '_').replace('?', '')
                       .replace('!', '').replace(':', '')) + ".md"
        md = '{{< ref "' + md_path + '" >}}'
        if os.path.exists(md_path):
            lines[i] = lines[i].replace(f"[{link}]", f"[{link}]({md}/)")
        else:
            lines[i] = lines[i].replace(
                f"[{link}]",
                f"[{link}](https://scrapbox.io/yuwd/{quote(link)})")

    # Hashtags on the second line become Hugo tags and are stripped.
    if i == 1:
        for m in tag_pattern.finditer(line):
            tag = m.group(1)
            tags.append(tag)
            lines[i] = lines[i].replace(f"#{tag}", "")

    # Images: resolve a Gyazo page URL to a direct image URL by probing
    # each known extension over HTTP until one responds 200.
    for m in img_pattern.finditer(lines[i]):
        url = m.group(1)
        img_url = url
        exts = ['png', 'gif', 'jpeg', 'jpg', 'webp']
        if "https://gyazo.com/" in url:
            for e in exts:
                if url.endswith(e):
                    img_url = url
                    break
                img_url = f"{url}.{e}"
                res = requests.get(img_url)
                if res.status_code == 200:
                    break
        # NOTE(review): replace target reconstructed as the bracketed URL.
        lines[i] = lines[i].replace(
            f"[{url}]",
            '{{<' + f'img src="{img_url}" position="center"' + '>}}<br>')
        imgs.append(img_url)

    # "- $...$" list items become standalone display-math lines.
    # NOTE(review): exact replacement lost; emitting the math on its own
    # line is a guess — TODO confirm against the original gist.
    for m in large_math_pattern.finditer(lines[i]):
        lines[i] = f"{m.group(1)}\n"

# Hugo front-matter template; placeholders are substituted below.
meta = \
"""
---
title: "TITLE"
date: DATE
description:
draft: false
hideToc: false
enableToc: true
enableTocContent: true
tocPosition: inner
tags:
TAGS
series:
-
libraries:
- katex
- mermaid
- msc
TOP_IMAGE
---
"""

title = lines[0].replace('\n', '')
if "paper" in tags:
    # Paper-memo pages get a marker prefix instead of the default tag.
    title = f"【論文メモ】{title}"
else:
    tags.append("post")
print("TAG", tags)

date = datetime.datetime.fromtimestamp(int(sys.argv[2])).strftime(
    '%Y-%m-%dT%H:%M:%S+09:00')
meta = meta.replace("TITLE", title)
meta = meta.replace("DATE", date)
meta = meta.replace("TAGS", "\n".join(f"- {t}" for t in tags))
meta = meta.replace(
    "TOP_IMAGE",
    f"image: {imgs[0]}" if imgs else "# image: None")

# Front matter + body; drop the title line and the tag line of the page.
lines = [part + '\n' for part in meta.split('\n')] + lines[2:]
with open(sys.argv[1], 'w') as f:
    f.writelines(lines)