Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2614d27
Agregar la app markup_doc para gestión y marcación de documentos DOCX
eduranm Apr 16, 2026
e5a6783
Agregar archivos DOCX de ejemplo para pruebas del flujo de marcación
eduranm Apr 16, 2026
3ddf5ca
Registrar markup_doc
eduranm Apr 16, 2026
a4d2fa6
Registrar markuplib y agregar utilidades base para lectura estructura…
eduranm Apr 19, 2026
adbb38d
Agregar utilidades de etiquetado y marcación automática de referencias
eduranm Apr 19, 2026
b3a5141
Integrar procesamiento automático de referencias en las tareas de mar…
eduranm Apr 19, 2026
fc28a0b
Disparar el procesamiento automático del DOCX desde el flujo de creac…
eduranm Apr 19, 2026
40ee0cb
Ajustar la respuesta de Gemini en el servicio de inferencia de model_ai
eduranm Apr 19, 2026
fb9da61
Agregar API first_block para procesar el bloque inicial del artículo
eduranm Apr 19, 2026
0cd08bb
Agregar utilidades para consultar first_block y extraer palabras clave
eduranm Apr 19, 2026
ad725c4
Integrar la identificación de elementos del front en get_labels
eduranm Apr 19, 2026
37b8362
Agregar utilidades para estructurar contenido especial en markup_doc
eduranm Apr 20, 2026
7e76a43
Agregar extracción de contenido especial desde DOCX en markuplib
eduranm Apr 20, 2026
03e2778
Integrar contenido especial del cuerpo en get_labels
eduranm Apr 20, 2026
1cd4475
Add missing extract_label_and_title helper
eduranm Apr 20, 2026
ab42655
Agregar generador XML para documentos marcados en markup_doc
eduranm Apr 21, 2026
8e9c6b1
Integrar generación y actualización de XML en las tareas de markup_doc
eduranm Apr 21, 2026
2ceccb1
Agregar utilidades para empaquetado SPS y resolución de assets
eduranm Apr 21, 2026
892a387
Agregar vistas y recursos estáticos para descarga y previsualización …
eduranm Apr 21, 2026
86cc68f
Registrar vistas de XML en Wagtail y regenerar el XML al editar
eduranm Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/api_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
from rest_framework.routers import DefaultRouter, SimpleRouter

from reference.api.v1.views import ReferenceViewSet

app_name = "reference"
from markup_doc.api.v1.views import ArticleViewSet

# Choose the router implementation from the DEBUG setting, then register
# every API viewset on it.
router = DefaultRouter() if settings.DEBUG else SimpleRouter()

router.register("reference", ReferenceViewSet, basename="reference")
router.register("first_block", ArticleViewSet, basename="first_block")

# URL patterns generated by the router for the registered viewsets.
urlpatterns = router.urls
2 changes: 2 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@
"reference",
"xml_manager",
"model_ai",
"markup_doc",
"markuplib",
]

INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS + WAGTAIL
Expand Down
Binary file added fixtures/Artigo 5.docx
Binary file not shown.
Binary file added fixtures/e14790.docx
Binary file not shown.
Binary file added fixtures/e740.docx
Binary file not shown.
Empty file added markup_doc/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions markup_doc/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.
Empty file added markup_doc/api/__init__.py
Empty file.
Empty file added markup_doc/api/v1/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions markup_doc/api/v1/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from rest_framework import serializers
from markup_doc.models import ArticleDocx

class ArticleDocxSerializer(serializers.ModelSerializer):
    """Model serializer for ``ArticleDocx`` that exposes all of its fields."""

    class Meta:
        model = ArticleDocx
        # "__all__" tells the serializer to include every model field.
        fields = "__all__"
43 changes: 43 additions & 0 deletions markup_doc/api/v1/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from django.shortcuts import render
from django.http import JsonResponse
from rest_framework.permissions import IsAuthenticated
from rest_framework.viewsets import GenericViewSet
from rest_framework.mixins import CreateModelMixin
from rest_framework.response import Response
from markup_doc.api.v1.serializers import ArticleDocxSerializer
from markup_doc.marker import mark_article

import json

# Create your views here.

class ArticleViewSet(
    GenericViewSet,  # generic view functionality
    CreateModelMixin,  # handles POSTs
):
    """POST-only endpoint that passes a JSON payload to ``mark_article``.

    The request body must be a JSON object; its ``text`` and ``metadata``
    members are forwarded to ``mark_article`` and the result is returned
    under the ``message`` key. Only authenticated users may call it.
    """

    serializer_class = ArticleDocxSerializer
    permission_classes = [IsAuthenticated]
    http_method_names = [
        "post",
    ]

    def create(self, request, *args, **kwargs):
        """Handle POST by delegating to :meth:`api_article`."""
        return self.api_article(request)

    def api_article(self, request):
        """Run ``mark_article`` on the posted JSON and return a JSON response.

        Args:
            request: Incoming request whose raw body holds a JSON object
                with optional ``text`` and ``metadata`` members.

        Returns:
            JsonResponse: ``{"message": <mark_article result>}`` on success.
            ``{"error": "Error processing"}`` with status 400 when the body
            is not valid JSON or is not a JSON object, and with status 500
            when ``mark_article`` itself raises ``json.JSONDecodeError``.
        """
        error_payload = {
            'error': 'Error processing'
        }

        try:
            data = json.loads(request.body)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError (undecodable bytes)
            # are both ValueError subclasses: either means a bad request.
            return JsonResponse(error_payload, status=400)

        # Valid JSON that is not an object (list, string, number, null)
        # has no .get(); reject it instead of failing with AttributeError.
        if not isinstance(data, dict):
            return JsonResponse(error_payload, status=400)

        post_text = data.get('text')
        post_metadata = data.get('metadata')

        try:
            resp_data = mark_article(post_text, post_metadata)
        except json.JSONDecodeError:
            # NOTE(review): the original try block also covered this call, so
            # a JSONDecodeError raised inside mark_article is still answered
            # with the error payload; it is now flagged as a server-side
            # failure rather than HTTP 200 - confirm the desired status.
            return JsonResponse(error_payload, status=500)

        return JsonResponse({
            'message': resp_data,
        })
6 changes: 6 additions & 0 deletions markup_doc/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class MarkupDocConfig(AppConfig):
    """Django application configuration for the ``markup_doc`` app."""

    # Field class used for implicitly created primary keys in this app.
    default_auto_field = "django.db.models.BigAutoField"
    # Dotted path of the application this configuration applies to.
    name = "markup_doc"
121 changes: 121 additions & 0 deletions markup_doc/choices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# List of 2-tuples in which both elements are the same tag string.
# Presumably consumed as Django-style (value, label) choices - verify
# against the callers.
front_labels = [
    (tag, tag)
    for tag in (
        '<abstract>',
        '<abstract-title>',
        '<aff>',
        '<article-id>',
        '<article-title>',
        '<author-notes>',
        '<contrib>',
        '<date-accepted>',
        '<date-received>',
        '<fig>',
        '<fig-attrib>',
        '<history>',
        '<kwd-title>',
        '<kwd-group>',
        '<list>',
        '<p>',
        '<sec>',
        '<sub-sec>',
        '<subject>',
        '<table>',
        '<table-foot>',
        '<title>',
        '<trans-abstract>',
        '<trans-title>',
        '<translate-front>',
        '<translate-body>',
        '<disp-formula>',
        '<inline-formula>',
        '<formula>',
    )
]

# Per-label hint table keyed by tag string. Each entry is a plain dict whose
# optional members are 'pos', 'next', 'lan', 'size', 'bold', 'italic',
# 'reset', 'repeat' and 'regex'.
# NOTE(review): 'size'/'bold'/'italic' look like typography hints and 'next'
# like the label expected afterwards - confirm against the code that reads
# this table.
order_labels = {
    '<article-id>': dict(pos=1, next='<subject>'),
    '<subject>': dict(pos=2, next='<article-title>'),
    '<article-title>': dict(pos=3, next='<trans-title>', lan=True),
    '<trans-title>': dict(size=14, bold=True, lan=True, next='<contrib>'),
    '<contrib>': dict(reset=True, size=12, next='<aff>'),
    '<aff>': dict(reset=True, size=12),
    '<abstract>': dict(size=12, bold=True, lan=True, next='<p>'),
    '<p>': dict(size=12, next='<p>', repeat=True),
    '<trans-abstract>': dict(size=12, bold=True, lan=True, next='<p>'),
    '<kwd-group>': dict(
        size=12,
        regex=r'(?i)(palabra.*clave.*:|keyword.*:)',
    ),
    '<history>': dict(
        size=12,
        regex=r'\d{2}/\d{2}/\d{4}',
    ),
    '<corresp>': dict(
        size=12,
        regex=r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
    ),
    '<sec>': dict(size=16, bold=True, next=None),
    '<sub-sec>': dict(size=12, italic=True, next=None),
    '<sub-sec-2>': dict(size=14, bold=True, next=None),
}

# Hint table for three labels ('<sec>', '<sub-sec>', '<p>'), using the
# 'size', 'bold' and 'italic' members only.
# NOTE(review): presumably the body-section counterpart of order_labels -
# confirm against the code that reads it.
order_labels_body = {
    '<sec>': dict(size=16, bold=True),
    '<sub-sec>': dict(size=12, italic=True),
    '<p>': dict(size=12),
}
1 change: 1 addition & 0 deletions markup_doc/forms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from wagtail.admin.forms.models import WagtailAdminModelForm
150 changes: 150 additions & 0 deletions markup_doc/issue_proc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from lxml import etree
from urllib.parse import urlparse
from packtools.sps.pid_provider.xml_sps_lib import get_xml_with_pre
import os


class Asset:
    """Lightweight holder for an image's file object and its storage name."""

    def __init__(self, wagtail_image):
        """Copy ``file`` and ``file.name`` from *wagtail_image*."""
        image_file = wagtail_image.file
        # File object of the image (the original notes it exposes ``.path``).
        self.file = image_file
        # Name of the file as recorded in the storage backend.
        self.original_href = image_file.name


class XmlIssueProc:
    """Derive issue-folder and package names from a record's XML.

    The instance parses ``registro.text_xml`` once and exposes:

    * ``xmltree``: the parsed XML tree,
    * ``journal_proc``: an object with an ``acron`` attribute,
    * ``issue_folder``: the folder name computed by
      :meth:`_extract_issue_folder`.
    """

    def __init__(self, registro):
        """Parse the XML held by *registro* and precompute derived values."""
        self.registro = registro
        self.xmltree = self._extract_xml_tree()
        self.journal_proc = self._extract_journal_proc()
        self.issue_folder = self._extract_issue_folder()

    def _extract_xml_tree(self):
        """Return the XML tree built from ``registro.text_xml``."""
        return get_xml_with_pre(self.registro.text_xml).xmltree

    def _extract_journal_proc(self):
        """Return an object whose ``acron`` is the publisher-id journal id.

        Falls back to ``"journal"`` when the XML has no such element.
        """
        acron = self.xmltree.findtext(".//journal-id[@journal-id-type='publisher-id']")
        return type("JournalProc", (), {"acron": acron or "journal"})

    def _get_issn(self):
        """Return the electronic ISSN, or the print ISSN when it is missing."""
        return (
            self.xmltree.findtext(".//issn[@pub-type='epub']")
            or self.xmltree.findtext(".//issn[@pub-type='ppub']")
        )

    def _volume_and_issue(self):
        """Return ``(volume, issue)`` stripped; the issue is lower-cased."""
        vol = (self.xmltree.findtext(".//volume") or "").strip()
        issue = (self.xmltree.findtext(".//issue") or "").strip().lower()
        return vol, issue

    @staticmethod
    def _issue_segments(vol, issue, vol_prefix="", num_prefix=""):
        """Build the volume/issue name segments shared by both name builders.

        Args:
            vol: Volume text, possibly empty.
            issue: Lower-cased issue text, possibly empty. It may be a plain
                number, a volume supplement (``"suppl 2"``), a number
                supplement (``"4 suppl 2"``) or a special number (``"spe1"``).
            vol_prefix: Text placed before the volume (``"v"`` or ``""``).
            num_prefix: Text placed before the issue number (``"n"`` or ``""``).

        Returns:
            list[str]: Zero, one or two segments, in order.
        """
        segments = []
        if vol:
            segments.append(f"{vol_prefix}{vol}")
        if not issue:
            return segments

        def attach(suffix):
            # Glue the suffix onto the volume segment. When there is no
            # volume, emit it as its own segment so it is never appended to
            # an unrelated part such as the journal acronym.
            if segments:
                segments[-1] += suffix
            else:
                segments.append(suffix)

        if issue.startswith("suppl"):
            # volume supplement -> v10s2
            attach(f"s{issue.replace('suppl', '').strip()}")
        elif "suppl" in issue:
            # number supplement -> v10 + n4s2
            tokens = issue.split()
            sup_num = "".join(tokens[1:]).replace("suppl", "").strip()
            segments.append(f"{num_prefix}{tokens[0]}s{sup_num}")
        elif issue.startswith("spe"):
            # special number -> v10nspe1
            # NOTE(review): the "n" is emitted even when num_prefix is empty,
            # exactly as the original package-name code did - confirm.
            attach(f"nspe{issue.replace('spe', '').strip()}")
        else:
            # regular number -> v4 + n10
            segments.append(f"{num_prefix}{issue}")
        return segments

    def _extract_issue_folder(self, lot=None):
        """Return the issue folder name, e.g. ``1234-5678-abc-v10-n4``.

        Args:
            lot: Optional integer batch number. When given together with a
                collection ``pub-date`` year in the XML, a final segment made
                of the two-digit lot and the last two digits of the year is
                appended.

        Returns:
            str: Non-empty segments joined with ``-``.
        """
        issn = self._get_issn() or ""
        acron = self.journal_proc.acron or ""
        vol, issue = self._volume_and_issue()
        year = self.xmltree.findtext(".//pub-date[@date-type='collection']/year")

        parts = [p for p in (issn, acron) if p]
        parts.extend(
            self._issue_segments(vol, issue, vol_prefix="v", num_prefix="n")
        )

        # continuous-publication folder with a lot number
        if lot and year:
            parts.append(f"{lot:02d}{year[-2:]}")

        return "-".join(parts)

    def build_pkg_name(self, lang=None):
        """Return the package name, e.g. ``1234-5678-abc-10-4-e123``.

        The article identifier segment is the first available of
        ``elocation-id``, ``fpage`` and the ``scielo-v2`` article id, or
        ``"na"`` when none is present.

        Args:
            lang: Optional language code appended as the last segment
                (the original notes it is meant for translations only).

        Returns:
            str: Non-empty segments joined with ``-``.
        """
        issn = self._get_issn() or ""
        acron = self.journal_proc.acron or ""
        vol, issue = self._volume_and_issue()

        # Drop missing ISSN/acronym so the name never starts with "-" or
        # contains "--", matching what _extract_issue_folder does.
        parts = [p for p in (issn, acron) if p]
        parts.extend(self._issue_segments(vol, issue))

        # article identifier
        elocation = self.xmltree.findtext(".//elocation-id")
        fpage = self.xmltree.findtext(".//fpage")
        pid = self.xmltree.findtext(".//article-id[@specific-use='scielo-v2']")

        if elocation:
            parts.append(elocation.strip())
        elif fpage:
            parts.append(fpage.strip())
        elif pid:
            parts.append(pid.strip())
        else:
            parts.append("na")  # fallback when no identifier exists

        # language only when it is a translation
        if lang:
            parts.append(lang)

        return "-".join(parts)

    def find_asset(self, basename, name):
        """Return ``Asset`` objects for body images whose file is *basename*.

        Every ``image`` block of ``registro.content_body`` is checked; an
        image matches when *basename* equals either the basename of its
        storage file name or the basename of its ``"original"`` rendition
        URL path.

        Args:
            basename: File basename to look for.
            name: Accepted for interface compatibility; not used here.

        Returns:
            list: One ``Asset`` per matching image, possibly empty.
        """
        assets = []
        if not self.registro.content_body:
            return assets

        for block in self.registro.content_body:
            if block.block_type != "image" or not block.value:
                continue
            wagtail_image = block.value.get("image")
            if not wagtail_image:
                continue

            # Real name in storage (e.g. foto1.abcd1234.jpg)
            storage_basename = os.path.basename(wagtail_image.file.name)

            # Name used in the XML (e.g. foto1.jpg)
            original_url = wagtail_image.get_rendition("original").url
            xml_basename = os.path.basename(urlparse(original_url).path)

            # A match on either name is accepted
            if basename in (storage_basename, xml_basename):
                assets.append(Asset(wagtail_image))

        return assets
Loading
Loading