-
Scrapboxと個人ブログ(Hugo)を同期させるようにした.
- scrapboxとクローラでも言及したが, 空のリンクに検索がヒットするのはよくないと思い, 同期を始めた.
- scrapbox自体は書き心地やUXが最高で手放したくないため, 一部ページを同期させ, 正しく検索結果が載るか試してみる.
-
コードは以下に示す通り.
- scrapbox記法からmarkdownへの変換はこちらを改変したものを使用.
    - javascriptの replace は割と万能である.
- shell script / python / Node.js が動く環境があればOK.
変換スクリプト (gist埋め込みより転載):
# Convert a Scrapbox page dump into a Hugo markdown post.
#
# Usage: scrapbox.py <file> <unix-timestamp>
#   <file>            Scrapbox page text; rewritten in place as Hugo markdown.
#   <unix-timestamp>  becomes the post date (JST "+09:00" offset hard-coded).
#
# NOTE(review): this listing was recovered from a garbled HTML extraction.
# Every literal tagged "reconstructed" below was truncated in the source and
# has been re-derived from the surrounding replace() calls — verify against
# the original gist before trusting the exact syntax conversions.
import datetime
import os
import re
import sys
from urllib.parse import quote

import requests

# "[*# Heading]" -> "### Heading"          (reconstructed — TODO confirm)
head_pattern = re.compile(r"\[\*# ([^\]]+)\]")
# "[$ formula]" inline math                (reconstructed — TODO confirm)
math_pattern = re.compile(r"\[\$ ([^\]]+)\]")
# "[Page Name]" internal Scrapbox link     (reconstructed — TODO confirm)
link_pattern = re.compile(r"\[([^\]]+)\]")
# "[Title https://...]" external link      (reconstructed — TODO confirm)
url_link_pattern = re.compile(r"\[([^\]]+?) (https?://[^\s\]]+)\]")
# "#tag" hashtags
tag_pattern = re.compile(r'#([^\s]+)')
# "[https://...]" bare image URL           (reconstructed — TODO confirm)
img_pattern = re.compile(r"\[(https?://[^\s\]]+)\]")
# "- $...$" list item holding display math (backslashes restored: the
# extracted source read "($[^$]+$)", which has impossible mid-string anchors)
large_math_pattern = re.compile(r'\s*-\s+(\$[^$]+\$)\s*$')

if len(sys.argv) < 3:
    print('Usage: scrapbox.py <file> <date>')
    sys.exit(1)
print(sys.argv)

imgs = []            # image URLs found; imgs[0] becomes the post's top image
tags = []            # hashtags harvested from the page's second line
in_snippets = False  # True while inside a ``` code fence

with open(sys.argv[1], 'r') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    # Skip (near-)empty lines.
    if len(line.replace(" ", "")) <= 1:
        continue
    # Leave fenced code blocks untouched.
    if line.startswith("```"):
        in_snippets = not in_snippets
    if in_snippets:
        continue

    # Scrapbox heading wrapping a link, e.g. "[*** [Page]]".
    # NOTE(review): the replacement string was lost in extraction;
    # '#' * (6 - j) is a guess at the intended markdown heading level.
    for j in range(6):
        stars = "*" * (6 - j)          # renamed from `hash` (shadowed builtin)
        token = f"[{stars} ["
        had_token = token in lines[i]
        lines[i] = lines[i].replace(token, "#" * (6 - j) + " [")
        if had_token:
            # Drop the trailing "]]\n" left behind.  (reconstructed)
            lines[i] = lines[i][:-3] + ')\n'
    line = lines[i]

    # "[*# X]" -> "### X"
    for link in {m.group(1) for m in head_pattern.finditer(line)}:
        lines[i] = lines[i].replace(f"[*# {link}]", f"### {link}")

    # Inline math: "[$ f]" -> "$f$", escaping "_" for Hugo/KaTeX.
    # NOTE(review): replace target/result reconstructed — TODO confirm.
    for link in {m.group(1) for m in math_pattern.finditer(line)}:
        escaped = link.replace("_", r"\_")
        lines[i] = lines[i].replace(f"[$ {link}]", f"${escaped}$")

    # External links: "[Title URL]" -> "[Title](URL)".
    for title, url in {(m.group(1), m.group(2))
                       for m in url_link_pattern.finditer(line)}:
        lines[i] = lines[i].replace(f"[{title} {url}]", f"[{title}]({url})")

    # Internal links: point at a local page when one exists, else Scrapbox.
    for link in {m.group(1) for m in link_pattern.finditer(line)}:
        md_path = (link.replace(' ', '_').replace('?', '')
                       .replace('!', '').replace(':', '')) + ".md"
        md = '{{< ref "' + md_path + '" >}}'
        if os.path.exists(md_path):
            lines[i] = lines[i].replace(f"[{link}]", f"[{link}]({md}/)")
        else:
            lines[i] = lines[i].replace(
                f"[{link}]",
                f"[{link}](https://scrapbox.io/yuwd/{quote(link)})")

    # Hashtags on the second line become Hugo tags and are stripped.
    if i == 1:
        for m in tag_pattern.finditer(line):
            tag = m.group(1)
            tags.append(tag)
            lines[i] = lines[i].replace(f"#{tag}", "")

    # Images: resolve a Gyazo page URL to a direct image URL by probing
    # each known extension over HTTP until one responds 200.
    for m in img_pattern.finditer(lines[i]):
        url = m.group(1)
        img_url = url
        exts = ['png', 'gif', 'jpeg', 'jpg', 'webp']
        if "https://gyazo.com/" in url:
            for e in exts:
                if url.endswith(e):
                    img_url = url
                    break
                img_url = f"{url}.{e}"
                res = requests.get(img_url)
                if res.status_code == 200:
                    break
        # NOTE(review): replace target reconstructed as the bracketed URL.
        lines[i] = lines[i].replace(
            f"[{url}]",
            '{{<' + f'img src="{img_url}" position="center"' + '>}}<br>')
        imgs.append(img_url)

    # "- $...$" list items become standalone display-math lines.
    # NOTE(review): exact replacement lost; emitting the math on its own
    # line is a guess — TODO confirm against the original gist.
    for m in large_math_pattern.finditer(lines[i]):
        lines[i] = f"{m.group(1)}\n"

# Hugo front-matter template; placeholders are substituted below.
meta = \
"""
---
title: "TITLE"
date: DATE
description:
draft: false
hideToc: false
enableToc: true
enableTocContent: true
tocPosition: inner
tags:
TAGS
series:
-
libraries:
- katex
- mermaid
- msc
TOP_IMAGE
---
"""

title = lines[0].replace('\n', '')
if "paper" in tags:
    # Paper-memo pages get a marker prefix instead of the default tag.
    title = f"【論文メモ】{title}"
else:
    tags.append("post")
print("TAG", tags)

date = datetime.datetime.fromtimestamp(int(sys.argv[2])).strftime(
    '%Y-%m-%dT%H:%M:%S+09:00')
meta = meta.replace("TITLE", title)
meta = meta.replace("DATE", date)
meta = meta.replace("TAGS", "\n".join(f"- {t}" for t in tags))
meta = meta.replace(
    "TOP_IMAGE",
    f"image: {imgs[0]}" if imgs else "# image: None")

# Front matter + body; drop the title line and the tag line of the page.
lines = [part + '\n' for part in meta.split('\n')] + lines[2:]
with open(sys.argv[1], 'w') as f:
    f.writelines(lines)