martes, 28 de mayo de 2024

Modelado de tópicos

BTB tesis. Introduccion

Subo la data para este análisis

moderna <- textreadr::read_document("G:/episteme_moderna.txt")

moderna <- textreadr::read_document("G:/episteme_moderna.txt")

En ocasiones la codificación del texto cambia automáticamente a

ASCII y necesito que esté en UTF-8. Arreglo encondign

moderna <- tau::fixEncoding(moderna)
moderna <- tau::fixEncoding(moderna)

Subo el paquete BTM

library(BTM)
spa_model <- udpipe_download_model(language = "spanish")
spa_model <- udpipe_load_model(spa_model$file_model)

Selecciono el modelo y preparo el marco de datos:


moderna_df <- tibble(id = paste0("tag-",1:length(moderna)),
                     texto = moderna) 


moderna_tag <- udpipe_annotate(spa_model, x = moderna_df$texto,
                         doc_id = moderna_df$id)

moderna_tag <- as.data.frame(moderna_tag)
stats <- txt_freq(moderna_tag$upos)

stats$key <- factor(stats$key,
                    levels = rev(stats$key))

Visualizo las partes de la oración capturadas y segmentadas:



stats %>% 
      ggplot(aes(freq, key))+
      geom_col() +
      labs(title = "Episteme moderna", subtitle = "UPOS (Universal Parts of Speech)\n según frecuencia", x = "Frecuencia", y= "UPOs")

Selecciono los sustantivos y, posteriormente, los adjetivos:


nombres <- subset(moderna_tag, upos %in% c("NOUN")) 
nombres <- txt_freq(nombres$token)
nombres$key <- factor(nombres$key, levels = rev(nombres$key))


nombres[1:10,] %>% 
   mutate(freq_pct = round(freq_pct,3)) %>% 
      ggplot(aes(freq_pct, key))+
      geom_col() +
      labs(title = "Episteme clásica", subtitle = "Nombres según frecuencia relativa", x = "Frecuencia (tanto por cien)", y= "Nombres")


adjetivos <- subset(moderna_tag, upos %in% c("ADJ")) 
adejetivos <- txt_freq(adjetivos$token)
adejetivos$key <- factor(adejetivos$key, 
                         levels = rev(adejetivos$key))


adejetivos[1:10,] %>% 
   mutate(freq_pct = round(freq_pct,4)) %>% 
      ggplot(aes(freq_pct, key))+
      geom_col() +
      labs(title = "Episteme moderna", subtitle = "Adjetivos según frecuencia relativa", x = "Frecuencia (tanto por cien)", y= "Adjetivos")

Búsqueda de palabras claves

Aplico el Rapid Automatic Keyword Extraction, RAKE, para extraer palabras claves


clasic_rake <- keywords_rake(x = moderna_tag, term = "lemma", 
                          group = "doc_id", 
                       relevant = moderna_tag$upos %in% c("NOUN",
                                                   "ADJ"))

clasic_rake$key <- factor(clasic_rake$keyword, 
                       levels= rev(clasic_rake$keyword))


clasic_rake[c(1:15),] %>% 
   mutate(rake = round(rake,4)) %>% 
      ggplot(aes(rake, key))+
      geom_col() +
      labs(title = "Episteme clásica, 1968-1999", subtitle = "Palabras claves según RAKE", x = "RAKE", y= "Bigrama")

Secuencia de nombres-verbos, verbos-frases


moderna_tag$phrase_tag <- as_phrasemachine(moderna_tag$upos, 
                                     type = "upos")

clasic_stats <- keywords_phrases(x = moderna_tag$phrase_tag, 
                              term = tolower(moderna_tag$token), 
                          pattern = "(A|N)*N(P+D*(A|N)*N)*", 
                          is_regex = TRUE, detailed = FALSE)

clasic_stats <- subset(clasic_stats, ngram > 1 & freq > 2)

clasic_stats$key <- factor(clasic_stats$keyword, 
                        levels = rev(clasic_stats$keyword))


clasic_stats %>% 
      ggplot(aes(freq, key))+
      geom_col() +
      labs(title = "Episteme moderna", 
           subtitle = "Palabras claves según coocurrencias frecuentes", x = "Frecuencia", y= "Ngrama")

Coocurrencias




cooc <- cooccurrence(x = subset(moderna_tag, upos %in% c("NOUN",
                                                   "ADJ")), 
                     term = "lemma", 
                     group = c("doc_id", "paragraph_id",
                               "sentence_id"))
wordnetwork <- head(cooc, 30)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "darkblue") +
  geom_node_text(aes(label = name), col = "darkgreen", 
                 size = 4) +
  #theme_graph(base_family = "Arial Narrow") +
  theme(legend.position = "none") +
  labs(title = "Cooccurrencia palabras en la misma oración", subtitle = "Sustantivos y adjetivos")



cooc2 <- cooccurrence(moderna_tag$lemma, 
                  relevant = moderna_tag$upos %in% c("NOUN",
                                               "ADJ"), 
                  skipgram = 1)

wordnetwork2 <- head(cooc2, 30)
wordnetwork2 <- graph_from_data_frame(wordnetwork2)
ggraph(wordnetwork2, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "violet") +
  geom_node_text(aes(label = name), col = "darkgreen", 
                 size = 4) +
  #theme_graph(base_family = "Arial Narrow") +
  theme(legend.position = "none") +
  labs(title = "Cooccurrencia de palabras", 
       subtitle = "Sustantivos y adjetivos")

Correlaciones


moderna_tag[1:10, c("token_id", "token", "head_token_id", "dep_rel")]

x <- cbind_dependencies(moderna_tag, type = "parent")

dim(x)
[1] 4503   20
nominalsubject <- subset(x, dep_rel %in% c("nsubj"))
dim(nominalsubject)
[1] 163  20
nominalsubject <- nominalsubject[, c("dep_rel", "token",
                                     "token_parent")]
nominalsubject
NA
LS0tDQp0aXRsZTogIkJUQiB0ZXNpcy4gSW50cm9kdWNjaW9uIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KYGBge3IsIGV2YWw9RkFMU0UgfQ0KbW9kZXJuYSA8LSB0ZXh0cmVhZHI6OnJlYWRfZG9jdW1lbnQoIkc6L2xhLWJpYmxpb3RlY2EvamVzdXMvYXB1bnRlcy9lcGlzdGVtZV9tb2Rlcm5hLnR4dCIpDQoNCm1vZGVybmEgPC0gdGV4dHJlYWRyOjpyZWFkX2RvY3VtZW50KCJHOi9sYS1iaWJsaW90ZWNhL2plc3VzL2FwdW50ZXMvZXBpc3RlbWVfbW9kZXJuYS50eHQiKQ0KYGBgDQoNCkFycmVnbG8gZW5jb25kaWduDQpgYGB7cn0NCm1vZGVybmEgPC0gdGF1OjpmaXhFbmNvZGluZyhtb2Rlcm5hKQ0KbW9kZXJuYSA8LSB0YXU6OmZpeEVuY29kaW5nKG1vZGVybmEpDQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoQlRNKQ0KYGBgDQoNCg0KYGBge3J9DQpzcGFfbW9kZWwgPC0gdWRwaXBlX2Rvd25sb2FkX21vZGVsKGxhbmd1YWdlID0gInNwYW5pc2giKQ0Kc3BhX21vZGVsIDwtIHVkcGlwZV9sb2FkX21vZGVsKHNwYV9tb2RlbCRmaWxlX21vZGVsKQ0KYGBgDQoNCmBgYHtyfQ0KDQptb2Rlcm5hX2RmIDwtIHRpYmJsZShpZCA9IHBhc3RlMCgidGFnLSIsMTpsZW5ndGgobW9kZXJuYSkpLA0KICAgICAgICAgICAgICAgICAgICAgdGV4dG8gPSBtb2Rlcm5hKSANCg0KDQptb2Rlcm5hX3RhZyA8LSB1ZHBpcGVfYW5ub3RhdGUoc3BhX21vZGVsLCB4ID0gbW9kZXJuYV9kZiR0ZXh0bywNCiAgICAgICAgICAgICAgICAgICAgICAgICBkb2NfaWQgPSBtb2Rlcm5hX2RmJGlkKQ0KDQptb2Rlcm5hX3RhZyA8LSBhcy5kYXRhLmZyYW1lKG1vZGVybmFfdGFnKQ0KDQpgYGANCg0KDQpgYGB7ciBlc3RhZGlzdGljYXNCYXNpY2FzLCBldmFsPUZBTFNFfQ0KDQpzdGF0cyA8LSB0eHRfZnJlcShtb2Rlcm5hX3RhZyR1cG9zKQ0KDQpzdGF0cyRrZXkgPC0gZmFjdG9yKHN0YXRzJGtleSwNCiAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gcmV2KHN0YXRzJGtleSkpDQpgYGANCg0KDQpgYGB7ciBlc3RzQmFzaWNhcywgY2FjaGU9VFJVRX0NCg0KDQpzdGF0cyAlPiUgDQogICAgICBnZ3Bsb3QoYWVzKGZyZXEsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJVUE9TIChVbml2ZXJzYWwgUGFydHMgb2YgU3BlZWNoKVxuIHNlZ8O6biBmcmVjdWVuY2lhIiwgeCA9ICJGcmVjdWVuY2lhIiwgeT0gIlVQT3MiKQ0KDQpgYGANCg0KDQoNCmBgYHtyIG9idGVuZ29Ob21icmVzfQ0KDQpub21icmVzIDwtIHN1YnNldChtb2Rlcm5hX3RhZywgdXBvcyAlaW4lIGMoIk5PVU4iKSkgDQpub21icmVzIDwtIHR4dF9mcmVxKG5vbWJyZXMkdG9rZW4pDQpub21icmVzJGtleSA8LSBmYWN0b3Iobm9tYnJlcyRrZXksIGxldmVscyA9IHJldihub21icmVzJGtleSkpDQoNCg0KYGBgDQoNCg0KYGBge3IgZXN0c0Jhc05vbWJyZXN9DQoNCg0Kbm9tYnJlc1sxOjEwLF0gJT4lIA0KICAgbXV0YXRlKGZyZXFfcGN0ID0gcm91bmQoZnJlcV9wY3QsMykpICU+JSANCiAgICAgIGdncGxvdChhZXMoZnJlcV9wY3QsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJOb21icmVzIHNlZ8O6biBmcmVjdWVuY2lhIHJlbGF0aXZhIiwgeCA9ICJGcmVjdWVuY2lhICh0YW50byBwb3IgY2llbikiLCB5PSAiTm9tYnJlcyIpDQoNCmBgYA0KDQoNCmBgYHtyIG9idGVuZ29BZGpldGl2b3N9DQoNCmFkamV0aXZvcyA8LSBzdWJzZXQobW9kZXJuYV90YWcsIHVwb3MgJWluJSBjKCJBREoiKSkgDQphZGVqZXRpdm9zIDwtIHR4dF9mcmVxKGFkamV0aXZvcyR0b2tlbikNCmFkZWpldGl2b3Mka2V5IDwtIGZhY3RvcihhZGVqZXRpdm9zJGtleSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gcmV2KGFkZWpldGl2b3Mka2V5KSkNCg0KDQpgYGANCg0KDQoNCmBgYHtyIGVzdHNCYXNBZGp9DQoNCg0KYWRlamV0aXZvc1sxOjEwLF0gJT4lIA0KICAgbXV0YXRlKGZyZXFfcGN0ID0gcm91bmQoZnJlcV9wY3QsNCkpICU+JSANCiAgICAgIGdncGxvdChhZXMoZnJlcV9wY3QsIGtleSkpKw0KICAgICAgZ2VvbV9jb2woKSArDQogICAgICBsYWJzKHRpdGxlID0gIkVwaXN0ZW1lIG1vZGVybmEiLCBzdWJ0aXRsZSA9ICJBZGpldGl2b3Mgc2Vnw7puIGZyZWN1ZW5jaWEgcmVsYXRpdmEiLCB4ID0gIkZyZWN1ZW5jaWEgKHRhbnRvIHBvciBjaWVuKSIsIHk9ICJBZGpldGl2b3MiKQ0KDQpgYGANCg0KDQoNCg0KIyMgQsO6c3F1ZWRhIGRlIHBhbGFicmFzIGNsYXZlcw0KDQoNClBhbGFicmFzIGNsYXZlcyBkZSBhY3VlcmRvIGNvbiBlbCBSYXBpZCBBdXRvbWF0aWMgS2V5d29yZCBFeHRyYWN0aW9uLCBSQUtFDQoNCg0KYGBge3IgY2xhc2ljUmFrZX0NCg0KY2xhc2ljX3Jha2UgPC0ga2V5d29yZHNfcmFrZSh4ID0gbW9kZXJuYV90YWcsIHRlcm0gPSAibGVtbWEiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgZ3JvdXAgPSAiZG9jX2lkIiwgDQogICAgICAgICAgICAgICAgICAgICAgIHJlbGV2YW50ID0gbW9kZXJuYV90YWckdXBvcyAlaW4lIGMoIk5PVU4iLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIkFESiIpKQ0KDQpjbGFzaWNfcmFrZSRrZXkgPC0gZmFjdG9yKGNsYXNpY19yYWtlJGtleXdvcmQsIA0KICAgICAgICAgICAgICAgICAgICAgICBsZXZlbHM9IHJldihjbGFzaWNfcmFrZSRrZXl3b3JkKSkNCg0KDQpgYGANCg0KDQoNCmBgYHtyIGNsYXNpY3Jha2UsIGNhY2hlPVRSVUV9DQoNCg0KY2xhc2ljX3Jha2VbYygxOjE1KSxdICU+JSANCiAgIG11dGF0ZShyYWtlID0gcm91bmQocmFrZSw0KSkgJT4lIA0KICAgICAgZ2dwbG90KGFlcyhyYWtlLCBrZXkpKSsNCiAgICAgIGdlb21fY29sKCkgKw0KICAgICAgbGFicyh0aXRsZSA9ICJFcGlzdGVtZSBtb2Rlcm5hIiwgc3VidGl0bGUgPSAiUGFsYWJyYXMgY2xhdmVzIHNlZ8O6biBSQUtFIiwgeCA9ICJSQUtFIiwgeT0gIkJpZ3JhbWEiKQ0KDQpgYGANCg0KDQoNCg0KU2VjdWVuY2lhIGRlIG5vbWJyZXMtdmVyYm9zLCB2ZXJib3MtZnJhc2VzIA0KDQpgYGB7ciBjbGFzaWNDb29jfQ0KDQptb2Rlcm5hX3RhZyRwaHJhc2VfdGFnIDwtIGFzX3BocmFzZW1hY2hpbmUobW9kZXJuYV90YWckdXBvcywgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHlwZSA9ICJ1cG9zIikNCg0KY2xhc2ljX3N0YXRzIDwtIGtleXdvcmRzX3BocmFzZXMoeCA9IG1vZGVybmFfdGFnJHBocmFzZV90YWcsIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdGVybSA9IHRvbG93ZXIobW9kZXJuYV90YWckdG9rZW4pLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgcGF0dGVybiA9ICIoQXxOKSpOKFArRCooQXxOKSpOKSoiLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgaXNfcmVnZXggPSBUUlVFLCBkZXRhaWxlZCA9IEZBTFNFKQ0KDQpjbGFzaWNfc3RhdHMgPC0gc3Vic2V0KGNsYXNpY19zdGF0cywgbmdyYW0gPiAxICYgZnJlcSA+IDIpDQoNCmNsYXNpY19zdGF0cyRrZXkgPC0gZmFjdG9yKGNsYXNpY19zdGF0cyRrZXl3b3JkLCANCiAgICAgICAgICAgICAgICAgICAgICAgIGxldmVscyA9IHJldihjbGFzaWNfc3RhdHMka2V5d29yZCkpDQoNCg0KYGBgDQoNCg0KDQpgYGB7ciBtb2Rlcm5hQ29vYywgY2FjaGU9VFJVRX0NCg0KDQpjbGFzaWNfc3RhdHMgJT4lIA0KICAgICAgZ2dwbG90KGFlcyhmcmVxLCBrZXkpKSsNCiAgICAgIGdlb21fY29sKCkgKw0KICAgICAgbGFicyh0aXRsZSA9ICJFcGlzdGVtZSBtb2Rlcm5hIiwgDQogICAgICAgICAgIHN1YnRpdGxlID0gIlBhbGFicmFzIGNsYXZlcyBzZWfDum4gY29vcnJ1Y2VuaWNhcyBmcmVjdWVudGVzIiwgeCA9ICJGcmVjdWVuY2lhIiwgeT0gIk5ncmFtYSIpDQoNCmBgYA0KDQoNCiMjIENvb2N1cnJlbmNpYXMNCg0KYGBge3IgbGFzQ29vY30NCg0KDQoNCmNvb2MgPC0gY29vY2N1cnJlbmNlKHggPSBzdWJzZXQobW9kZXJuYV90YWcsIHVwb3MgJWluJSBjKCJOT1VOIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBREoiKSksIA0KICAgICAgICAgICAgICAgICAgICAgdGVybSA9ICJsZW1tYSIsIA0KICAgICAgICAgICAgICAgICAgICAgZ3JvdXAgPSBjKCJkb2NfaWQiLCAicGFyYWdyYXBoX2lkIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAic2VudGVuY2VfaWQiKSkNCg0KYGBgDQoNCg0KYGBge3IgbGFzQ29vY0csIGNhY2hlPVRSVUV9DQp3b3JkbmV0d29yayA8LSBoZWFkKGNvb2MsIDMwKQ0Kd29yZG5ldHdvcmsgPC0gZ3JhcGhfZnJvbV9kYXRhX2ZyYW1lKHdvcmRuZXR3b3JrKQ0KZ2dyYXBoKHdvcmRuZXR3b3JrLCBsYXlvdXQgPSAiZnIiKSArDQogIGdlb21fZWRnZV9saW5rKGFlcyh3aWR0aCA9IGNvb2MsIGVkZ2VfYWxwaGEgPSBjb29jKSwgZWRnZV9jb2xvdXIgPSAiZGFya2JsdWUiKSArDQogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBjb2wgPSAiZGFya2dyZWVuIiwgDQogICAgICAgICAgICAgICAgIHNpemUgPSA0KSArDQogICN0aGVtZV9ncmFwaChiYXNlX2ZhbWlseSA9ICJBcmlhbCBOYXJyb3ciKSArDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikgKw0KICBsYWJzKHRpdGxlID0gIkNvb2NjdXJyZW5jZGUgcGFsYWJyYXMgZW4gbGEgbWlzbWEgb3JhY2nDs24iLCBzdWJ0aXRsZSA9ICJTdXN0YW50aXZvcyB5IGFkamV0aXZvcyIpDQoNCmBgYA0KDQoNCg0KDQpgYGB7ciBsYXNDb29jaX0NCg0KDQpjb29jMiA8LSBjb29jY3VycmVuY2UobW9kZXJuYV90YWckbGVtbWEsIA0KICAgICAgICAgICAgICAgICAgcmVsZXZhbnQgPSBtb2Rlcm5hX3RhZyR1cG9zICVpbiUgYygiTk9VTiIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJBREoiKSwgDQogICAgICAgICAgICAgICAgICBza2lwZ3JhbSA9IDEpDQoNCg0KYGBgDQoNCg0KYGBge3IgbGFzQ29vY0dpLCBjYWNoZT1UUlVFfQ0KDQp3b3JkbmV0d29yazIgPC0gaGVhZChjb29jMiwgMzApDQp3b3JkbmV0d29yazIgPC0gZ3JhcGhfZnJvbV9kYXRhX2ZyYW1lKHdvcmRuZXR3b3JrMikNCmdncmFwaCh3b3JkbmV0d29yazIsIGxheW91dCA9ICJmciIpICsNCiAgZ2VvbV9lZGdlX2xpbmsoYWVzKHdpZHRoID0gY29vYywgZWRnZV9hbHBoYSA9IGNvb2MpLCBlZGdlX2NvbG91ciA9ICJ2aW9sZXQiKSArDQogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBjb2wgPSAiZGFya2dyZWVuIiwgDQogICAgICAgICAgICAgICAgIHNpemUgPSA0KSArDQogICN0aGVtZV9ncmFwaChiYXNlX2ZhbWlseSA9ICJBcmlhbCBOYXJyb3ciKSArDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikgKw0KICBsYWJzKHRpdGxlID0gIkNvb2NjdXJyZW5jaWEgZGUgcGFsYWJyYXMiLCANCiAgICAgICBzdWJ0aXRsZSA9ICJTdXN0YW50aXZvcyB5IGFkamV0aXZvcyIpDQoNCmBgYA0KDQoNCiMjIENvcnJlbGFjaW9uZXMNCmBgYHtyLCBjYWNoZT1UUlVFfQ0KDQptb2Rlcm5hX3RhZ1sxOjEwLCBjKCJ0b2tlbl9pZCIsICJ0b2tlbiIsICJoZWFkX3Rva2VuX2lkIiwgImRlcF9yZWwiKV0NCg0KeCA8LSBjYmluZF9kZXBlbmRlbmNpZXMobW9kZXJuYV90YWcsIHR5cGUgPSAicGFyZW50IikNCg0KZGltKHgpDQoNCm5vbWluYWxzdWJqZWN0IDwtIHN1YnNldCh4LCBkZXBfcmVsICVpbiUgYygibnN1YmoiKSkNCmRpbShub21pbmFsc3ViamVjdCkNCg0Kbm9taW5hbHN1YmplY3QgPC0gbm9taW5hbHN1YmplY3RbLCBjKCJkZXBfcmVsIiwgInRva2VuIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAidG9rZW5fcGFyZW50IildDQpub21pbmFsc3ViamVjdA0KDQpgYGA=

No hay comentarios.: